HierarchicalClusterPicker.h

Go to the documentation of this file.
00001 //
00002 //  Copyright (C) 2003-2006 Rational Discovery LLC
00003 //
00004 //   @@ All Rights Reserved  @@
00005 //
00006 #ifndef _HIERARCHCLUSTERPICKER_H
00007 #define _HIERARCHCLUSTERPICKER_H
00008 
00009 #include <RDGeneral/types.h>
00010 #include "DistPicker.h"
00011 
00012 namespace RDPickers {
00013   
00014   /*! \brief Diversity picker based on hierarchical clustering
00015    *  
00016    *  This class inherits from DistPicker since it uses the distance matrix
00017    *  for diversity picking. The clustering itself is done using the Murtagh 
00018    *  code in $RDBASE/Code/ML/Cluster/Mutagh/ (NOTE(review): directory spelling differs from "Murtagh" above - confirm the path)
00019    */
00020   class HierarchicalClusterPicker : public DistPicker {
00021   public:
00022 
00023     /*! \brief The type of hierarchical clustering algorithm to use
00024      *  Each enumerator carries an explicit value, 1 (WARD) through 7 (CENTROID). */
00025     typedef enum {
00026       WARD=1,
00027       SLINK=2,
00028       CLINK=3,
00029       UPGMA=4,
00030       MCQUITTY=5,
00031       GOWER=6,
00032       CENTROID=7 } ClusterMethod;
00033 
00034     /*! \brief Constructor - takes a ClusterMethod as an argument
00035      *
00036      * Stores the hierarchical clustering method in d_method; explicit, so a ClusterMethod is never implicitly converted to a picker
00037      */
00038     explicit HierarchicalClusterPicker(ClusterMethod clusterMethod) : d_method(clusterMethod) {;};
00039 
00040     /*! \brief This is the function that does the picking
00041      *
00042      * Here is how the algorithm works \n
00043      *  FIX: Supply reference
00044      *
00045      * - The entire pool is clustered using the distance matrix using one of the 
00046      *   hierarchical clustering methods (specified via the constructor). \n
00047      * - Starting with the individual items in the pool, clusters are merged based 
00048      *   on the output from the clustering method. \n
00049      * - The merging is stopped when the number of clusters is the same as 
00050      *   the number of picks.
00051      * - For each item in a cluster the sum of squares of the distances to the rest
00052      *   of the items (in the cluster) is computed. The item with the smallest of these values is
00053      *   picked as a representative of the cluster. Basically trying to pick the item closest
00054      *   to the centroid of the cluster. 
00055      *
00056      *
00057      *    \param distMat - distance matrix - a vector of double. It is assumed that only the 
00058      *              lower triangle elements of the matrix are supplied in a 1D array\n
00059      *              NOTE: this matrix WILL BE ALTERED during the picking (NOTE(review): the parameter is declared const double * - confirm against the implementation)\n
00060      *    \param poolSize - the size of the pool to pick the items from. It is assumed that the
00061      *              distance matrix above contains the right number of elements; i.e.
00062      *              poolSize*(poolSize-1) (NOTE(review): a strict lower triangle would hold poolSize*(poolSize-1)/2 values - confirm) \n
00063      *    \param pickSize - the number of items to pick from the pool (<= poolSize)
00064      *    \return an RDKit::INT_VECT of picks (presumably pool indices of the chosen items - TODO confirm) */
00065     RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const ;
00066 
00067     /*! \brief This is the function that does the clustering of the items - used by the picker
00068      *
00069      * ARGUMENTS:
00070      *
00071      *   \param distMat - distance matrix - a vector of double. It is assumed that only the 
00072      *              lower triangle elements of the matrix are supplied in a 1D array\n
00073      *              NOTE: this matrix WILL BE ALTERED during the picking (NOTE(review): the parameter is declared const double * - confirm against the implementation)\n
00074      *   \param poolSize - the size of the pool to pick the items from. It is assumed that the
00075      *              distance matrix above contains the right number of elements; i.e.
00076      *              poolSize*(poolSize-1) (NOTE(review): a strict lower triangle would hold poolSize*(poolSize-1)/2 values - confirm) \n
00077      *   \param pickSize - the number of clusters to divide the pool into (<= poolSize)
00078      *   \return an RDKit::VECT_INT_VECT (presumably one list of pool indices per cluster - TODO confirm) */
00079     RDKit::VECT_INT_VECT cluster(const double *distMat, unsigned int poolSize, unsigned int pickSize) const;
00080 
00081   private:
00082     ClusterMethod d_method; //!< clustering method supplied to the constructor
00083   };
00084 }; // namespace RDPickers
00085 
00086 #endif

Generated on Sat May 24 08:36:32 2008 for RDCode by  doxygen 1.5.3