MaxMinPicker.h

Go to the documentation of this file.
00001 //
00002 //  Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
00003 //
00004 //   @@ All Rights Reserved  @@
00005 //
00006 #ifndef __RD_MAXMINPICKER_H__
00007 #define __RD_MAXMINPICKER_H__
00008 
00009 #include <RDGeneral/types.h>
00010 #include <RDGeneral/utils.h>
00011 #include <RDGeneral/Invariant.h>
00012 #include <RDGeneral/RDLog.h>
00013 #include <RDBoost/Exceptions.h>
00014 #include <cstdlib>
00015 #include "DistPicker.h"
00016 
00017 namespace RDPickers {
00018 
00019   namespace {
00020     class distmatFunctor{
00021     public:
00022       distmatFunctor(const double *distMat) : dp_distMat(distMat) {};
00023       double operator()(unsigned int i,unsigned int j) {
00024         return getDistFromLTM(this->dp_distMat,i,j);
00025       }
00026     private:
00027       const double *dp_distMat;
00028     };
00029   }
00030 
00031   /*! \brief Implements the MaxMin algorithm for picking a subset of item from a pool
00032    *
00033    *  This class inherits from the DistPicker and implements a specific picking strategy
00034    *  aimed at diversity. See documentation for "pick()" member function for the algorithm details
00035    */
00036   class MaxMinPicker : public DistPicker {
00037   public:
00038     /*! \brief Default Constructor
00039      *
00040      */
00041     MaxMinPicker() {};
00042 
00043     /*! \brief Contains the implementation for a lazy MaxMin diversity picker
00044      *
00045      * See the documentation for the pick() method for details about the algorithm
00046      *
00047      *   \param func - a function (or functor) taking two unsigned ints as arguments
00048      *              and returning the distance (as a double) between those two elements.   
00049      *   \param poolSize - the size of the pool to pick the items from. It is assumed that the
00050      *              distance matrix above contains the right number of elements; i.e.
00051      *              poolSize*(poolSize-1) \n
00052      *   \param pickSize - the number items to pick from pool (<= poolSize)
00053      *   \param firstPicks - (optional)the first items in the pick list
00054      */
00055     template <typename T>
00056     RDKit::INT_VECT lazyPick(T &func, 
00057                              unsigned int poolSize, unsigned int pickSize,
00058                              RDKit::INT_VECT firstpicks=RDKit::INT_VECT()) const;
00059 
00060     /*! \brief Contains the implementation for the MaxMin diversity picker
00061      *
00062      * Here is how the picking algorithm works, refer to
00063      *   Ashton, M. et. al., Quant. Struct.-Act. Relat., 21 (2002), 598-604
00064      * for more detail:
00065      *
00066      * A subset of k items is to be selected from a pool containing N molecules. 
00067      * Then the MaxMin method is as follows:
00068      *  -# Initialise Subset with some appropriately chosen seed
00069      *     compound and set x = 1.
00070      *  -# For each of the N-x remaining compounds in Dataset
00071      *     calculate its dissimilarity with each of the x compounds in
00072      *     Subset and retain the smallest of these x dissimilarities for
00073      *     each compound in Dataset.
00074      *  -# Select the molecule from Dataset with the largest value
00075      *     for the smallest dissimilarity calculated in Step 2 and
00076      *     transfer it to Subset.
00077      *  -# Set x = x + 1 and return to Step 2 if x < k.
00078      *
00079      *  
00080      *
00081      *   \param distMat - distance matrix - a vector of double. It is assumed that only the 
00082      *              lower triangle element of the matrix are supplied in a 1D array\n
00083      *   \param poolSize - the size of the pool to pick the items from. It is assumed that the
00084      *              distance matrix above contains the right number of elements; i.e.
00085      *              poolSize*(poolSize-1) \n
00086      *   \param pickSize - the number items to pick from pool (<= poolSize)
00087      *   \param firstPicks - indices of the items used to seed the pick set.
00088     */
00089     RDKit::INT_VECT pick(const double *distMat, 
00090                          unsigned int poolSize, unsigned int pickSize,
00091                          RDKit::INT_VECT firstPicks) const {
00092       CHECK_INVARIANT(distMat, "Invalid Distance Matrix");
00093       if(poolSize<pickSize)
00094         throw ValueErrorException("pickSize cannot be larger than the poolSize");
00095       distmatFunctor functor(distMat);
00096       return this->lazyPick(functor,poolSize,pickSize,firstPicks);
00097     }
00098 
00099     /*! \overload */
00100     RDKit::INT_VECT pick(const double *distMat, 
00101                          unsigned int poolSize, unsigned int pickSize) const {
00102       RDKit::INT_VECT iv;
00103       return pick(distMat,poolSize,pickSize,iv);
00104     }
00105 
00106 
00107 
00108   };
00109   // we implement this here in order to allow arbitrary functors without link errors
00110   template <typename T>
00111   RDKit::INT_VECT MaxMinPicker::lazyPick(T &func,
00112                                          unsigned int poolSize, unsigned int pickSize,
00113                                          RDKit::INT_VECT firstPicks) const {
00114     if(poolSize<pickSize)
00115       throw ValueErrorException("pickSize cannot be larger than the poolSize");
00116 
00117     RDKit::INT_LIST pool;
00118 
00119     RDKit::INT_VECT picks;
00120     picks.reserve(pickSize);
00121     unsigned int pick=0;
00122 
00123     // enter the pool into a list so that we can pick out of it easily
00124     for (unsigned int i = 0; i < poolSize; i++) {
00125       pool.push_back(i);
00126     }
00127 
00128     // pick the first entry
00129     if(!firstPicks.size()){
00130       pick = rand()%poolSize;
00131       // add the pick to the picks
00132       picks.push_back(pick);
00133       // and remove it from the pool
00134       pool.remove(pick);
00135     } else{
00136       for(RDKit::INT_VECT::const_iterator pIdx=firstPicks.begin();
00137           pIdx!=firstPicks.end();++pIdx){
00138         pick = static_cast<unsigned int>(*pIdx);
00139         if(pick>=poolSize)
00140           throw ValueErrorException("pick index was larger than the poolSize");
00141         picks.push_back(pick);
00142         pool.remove(pick);
00143       }
00144     }
00145     // now pick 1 compound at a time
00146     while (picks.size() < pickSize) {
00147       double maxOFmin = -1.0;
00148       RDKit::INT_LIST_I plri=pool.end();
00149       for(RDKit::INT_LIST_I pli=pool.begin();
00150           pli!=pool.end(); ++pli){
00151         unsigned int poolIdx = (*pli);
00152         double minTOi = RDKit::MAX_DOUBLE;
00153         for (RDKit::INT_VECT_CI pi = picks.begin(); 
00154              pi != picks.end(); ++pi) {
00155           unsigned int pickIdx = (*pi);
00156           CHECK_INVARIANT(poolIdx!=pickIdx,"");
00157           double dist = func(poolIdx,pickIdx);
00158           if (dist <= minTOi) {
00159             minTOi = dist;
00160           }
00161         }
00162         if (minTOi > maxOFmin || (RDKit::feq(minTOi,maxOFmin) && poolIdx<pick) ) {
00163           maxOFmin = minTOi;
00164           pick = poolIdx;
00165           plri = pli;
00166         }
00167       }
00168       
00169       // now add the new pick to  picks and remove it from the pool
00170       picks.push_back(pick);
00171       CHECK_INVARIANT(plri!=pool.end(),"");
00172       pool.erase(plri);
00173     }
00174     return picks;
00175   }
00176 
00177 
00178 };
00179 
00180 #endif

Generated on Fri Apr 3 06:03:02 2009 for RDCode by  doxygen 1.5.6