MaxMinPicker.h

Go to the documentation of this file.
00001 //
00002 //  Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
00003 //
00004 //   @@ All Rights Reserved  @@
00005 //
00006 #ifndef _RD_MAXMINPICKER_H_
00007 #define _RD_MAXMINPICKER_H_
00008 
00009 #include <RDGeneral/types.h>
00010 #include <RDGeneral/utils.h>
00011 #include <RDGeneral/Invariant.h>
00012 #include <RDGeneral/RDLog.h>
00013 #include <RDBoost/Exceptions.h>
00014 #include <cstdlib>
00015 #include "DistPicker.h"
00016 
00017 namespace RDPickers {
00018 
00019   namespace {
00020     class distmatFunctor{
00021     public:
00022       distmatFunctor(const double *distMat) : dp_distMat(distMat) {};
00023       double operator()(unsigned int i,unsigned int j) {
00024         return getDistFromLTM(this->dp_distMat,i,j);
00025       }
00026     private:
00027       const double *dp_distMat;
00028     };
00029   }
00030 
00031   /*! \brief Implements the MaxMin algorithm for picking a subset of item from a pool
00032    *
00033    *  This class inherits from the DistPicker and implements a specific picking strategy
00034    *  aimed at diversity. See documentation for "pick()" member function for the algorithm details
00035    */
00036   class MaxMinPicker : public DistPicker {
00037   public:
00038     /*! \brief Default Constructor
00039      *
00040      */
00041     MaxMinPicker() {};
00042 
00043     /*! \brief Contains the implementation for a lazy MaxMin diversity picker
00044      *
00045      * See the documentation for the pick() method for details about the algorithm
00046      *
00047      *   \param func - a function (or functor) taking two unsigned ints as arguments
00048      *              and returning the distance (as a double) between those two elements.   
00049      *   \param poolSize - the size of the pool to pick the items from. It is assumed that the
00050      *              distance matrix above contains the right number of elements; i.e.
00051      *              poolSize*(poolSize-1) \n
00052      *   \param pickSize - the number items to pick from pool (<= poolSize)
00053      *   \param firstPicks - (optional)the first items in the pick list
00054      */
00055     template <typename T>
00056     RDKit::INT_VECT lazyPick(T &func, 
00057                              unsigned int poolSize, unsigned int pickSize,
00058                              RDKit::INT_VECT firstpicks=RDKit::INT_VECT()) const;
00059 
00060     /*! \brief Contains the implementation for the MaxMin diversity picker
00061      *
00062      * Here is how the picking algorithm works, refer to
00063      *   Ashton, M. et. al., Quant. Struct.-Act. Relat., 21 (2002), 598-604
00064      * for more detail:
00065      *
00066      * A subset of k items is to be selected from a pool containing N molecules. 
00067      * Then the MaxMin method is as follows:
00068      *  1. Initialise Subset with a randomly chosen seed
00069      *     compound and set x = 1.
00070      *  2. For each of the N-x remaining compounds in Dataset
00071      *     calculate its dissimilarity with each of the x compounds in
00072      *     Subset and retain the smallest of these x dissimilarities for
00073      *     each compound in Dataset.
00074      *  3. Select the molecule from Dataset with the largest value
00075      *     for the smallest dissimilarity calculated in Step 2 and
00076      *     transfer it to Subset.
00077      *  4. Set x = x + 1 and return to Step 2 if x < k.
00078      *
00079      *   \param distMat - distance matrix - a vector of double. It is assumed that only the 
00080      *              lower triangle element of the matrix are supplied in a 1D array\n
00081      *   \param poolSize - the size of the pool to pick the items from. It is assumed that the
00082      *              distance matrix above contains the right number of elements; i.e.
00083      *              poolSize*(poolSize-1) \n
00084      *   \param pickSize - the number items to pick from pool (<= poolSize)
00085      *   \param firstPicks - indices of the items used to seed the pick set.
00086     */
00087     RDKit::INT_VECT pick(const double *distMat, 
00088                          unsigned int poolSize, unsigned int pickSize,
00089                          RDKit::INT_VECT firstPicks) const {
00090       CHECK_INVARIANT(distMat, "Invalid Distance Matrix");
00091       if(poolSize<pickSize)
00092         throw ValueErrorException("pickSize cannot be larger than the poolSize");
00093       distmatFunctor functor(distMat);
00094       return this->lazyPick(functor,poolSize,pickSize,firstPicks);
00095     }
00096 
00097     /*! \overload */
00098     RDKit::INT_VECT pick(const double *distMat, 
00099                          unsigned int poolSize, unsigned int pickSize) const {
00100       RDKit::INT_VECT iv;
00101       return pick(distMat,poolSize,pickSize,iv);
00102     }
00103 
00104 
00105 
00106   };
00107   // we implement this here in order to allow arbitrary functors without link errors
00108   template <typename T>
00109   RDKit::INT_VECT MaxMinPicker::lazyPick(T &func,
00110                                          unsigned int poolSize, unsigned int pickSize,
00111                                          RDKit::INT_VECT firstPicks) const {
00112     if(poolSize<pickSize)
00113       throw ValueErrorException("pickSize cannot be larger than the poolSize");
00114 
00115     RDKit::INT_LIST pool;
00116 
00117     RDKit::INT_VECT picks;
00118     picks.reserve(pickSize);
00119     unsigned int pick=0;
00120 
00121     // enter the pool into a list so that we can pick out of it easily
00122     for (unsigned int i = 0; i < poolSize; i++) {
00123       pool.push_back(i);
00124     }
00125 
00126     // pick the first entry
00127     if(!firstPicks.size()){
00128       pick = rand()%poolSize;
00129       // add the pick to the picks
00130       picks.push_back(pick);
00131       // and remove it from the pool
00132       pool.remove(pick);
00133     } else{
00134       for(RDKit::INT_VECT::const_iterator pIdx=firstPicks.begin();
00135           pIdx!=firstPicks.end();++pIdx){
00136         pick = static_cast<unsigned int>(*pIdx);
00137         if(pick>=poolSize)
00138           throw ValueErrorException("pick index was larger than the poolSize");
00139         picks.push_back(pick);
00140         pool.remove(pick);
00141       }
00142     }
00143     // now pick 1 compound at a time
00144     while (picks.size() < pickSize) {
00145       double maxOFmin = -1.0;
00146       RDKit::INT_LIST_I plri=pool.end();
00147       for(RDKit::INT_LIST_I pli=pool.begin();
00148           pli!=pool.end(); ++pli){
00149         unsigned int poolIdx = (*pli);
00150         double minTOi = RDKit::MAX_DOUBLE;
00151         for (RDKit::INT_VECT_CI pi = picks.begin(); 
00152              pi != picks.end(); ++pi) {
00153           unsigned int pickIdx = (*pi);
00154           CHECK_INVARIANT(poolIdx!=pickIdx,"");
00155           double dist = func(poolIdx,pickIdx);
00156           if (dist <= minTOi) {
00157             minTOi = dist;
00158           }
00159         }
00160         if (minTOi > maxOFmin || (RDKit::feq(minTOi,maxOFmin) && poolIdx<pick) ) {
00161           maxOFmin = minTOi;
00162           pick = poolIdx;
00163           plri = pli;
00164         }
00165       }
00166       
00167       // now add the new pick to  picks and remove it from the pool
00168       picks.push_back(pick);
00169       CHECK_INVARIANT(plri!=pool.end(),"");
00170       pool.erase(plri);
00171     }
00172     return picks;
00173   }
00174 
00175 
00176 };
00177 
00178 #endif

Generated on Sat May 24 08:36:32 2008 for RDCode by  doxygen 1.5.3