RDKit
Open-source cheminformatics and machine learning.
HierarchicalClusterPicker.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2006 Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #ifndef _HIERARCHCLUSTERPICKER_H
11 #define _HIERARCHCLUSTERPICKER_H
12 
13 #include <RDGeneral/types.h>
14 #include "DistPicker.h"
15 
16 namespace RDPickers {
17 
18 /*! \brief Diversity picker based on hierarchical clustering
19  *
20  * This class inherits from DistPicker since it uses the distance matrix
21  * for diversity picking. The clustering itself is done using the Murtagh
22  * code in $RDBASE/Code/ML/Cluster/Murtagh/
23  */
25  public:
26  /*! \brief The type of hierarchical clustering algorithm to use
27  * Each enumerator has an explicit integer value, starting at 1. NOTE(review): listing line 35 is absent from this view, so an enumerator after GOWER may be missing here - confirm against the header. */
28  typedef enum {
29  WARD = 1,
30  SLINK = 2,
31  CLINK = 3,
32  UPGMA = 4,
33  MCQUITTY = 5,
34  GOWER = 6,
36  } ClusterMethod;
37 
38  /*! \brief Constructor - takes a ClusterMethod as an argument
39  *
40  * \param clusterMethod - hierarchical clustering method, stored in d_method
41  */
42  explicit HierarchicalClusterPicker(ClusterMethod clusterMethod)
43  : d_method(clusterMethod) {}
46 
47  /*! \brief This is the function that does the picking
48  *
49  * Here is how the algorithm works \n
50  * FIX: Supply reference
51  *
52  * - The entire pool is clustered using the distance matrix using one of the
53  * hierarchical clustering methods (specified via the constructor). \n
54  * - Starting with the individual items in the pool, clusters are merged based
55  * on the output from the clustering method. \n
56  * - The merging is stopped when the number of clusters is the same as
57  * the number of picks.
58  * - For each item in a cluster the sum of the squares of the distances to the
59  * rest of the items (in the cluster) is computed. The item with the smallest
60  * of these values is picked as a representative of the cluster. Basically
61  * trying to pick the item closest to the centroid of the cluster.
62  *
63  * \param distMat - distance matrix - a vector of double. It is assumed that
64  * only the lower triangle elements of the matrix are supplied in a 1D
65  * array\n
66  * NOTE: this matrix WILL BE ALTERED during the picking\n
67  * NOTE(review): the parameter is nevertheless declared `const double *` -
68  * confirm how the alteration happens.
69  * \param poolSize - the size of the pool to pick the items from. It is
70  * assumed that the distance matrix above contains the right number of
71  * elements; i.e. poolSize*(poolSize-1) \n
72  * NOTE(review): a lower triangle without the diagonal holds
73  * poolSize*(poolSize-1)/2 entries - confirm which count is expected.
74  * \param pickSize - the number of items to pick from the pool (<= poolSize)
75  *
76  * \return an RDKit::INT_VECT; presumably the pool indices of the picked
77  * items - confirm against the implementation.
78  */
79  RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
80  unsigned int pickSize) const;
81 
82  /*! \brief This is the function that does the clustering of the items - used
83  * by the picker
84  *
85  * ARGUMENTS:
86  *
87  * \param distMat - distance matrix - a vector of double. It is assumed that
88  * only the lower triangle elements of the matrix are supplied in a 1D array\n
89  * NOTE: this matrix WILL BE ALTERED during the picking\n
90  * NOTE(review): the parameter is nevertheless declared `const double *` -
91  * confirm how the alteration happens.
92  * \param poolSize - the size of the pool to pick the items from. It is
93  * assumed that the distance matrix above contains the right number of
94  * elements; i.e. poolSize*(poolSize-1) \n
95  * \param pickSize - the number of clusters to divide the pool into (<= poolSize)
96  *
97  * \return an RDKit::VECT_INT_VECT; presumably one INT_VECT of item indices
98  * per cluster - confirm against the implementation.
99  */
100  RDKit::VECT_INT_VECT cluster(const double *distMat, unsigned int poolSize,
101  unsigned int pickSize) const;
102 
103  private:
104  ClusterMethod d_method;
105 };
106 };
107 
108 #endif
Diversity picker based on hierarchical clustering.
std::vector< INT_VECT > VECT_INT_VECT
Definition: types.h:202
std::vector< int > INT_VECT
Definition: types.h:188
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const
This is the function that does the picking.
Abstract base class to perform item picking (typically molecules) using a distance matrix...
Definition: DistPicker.h:43
RDKit::VECT_INT_VECT cluster(const double *distMat, unsigned int poolSize, unsigned int pickSize) const
This is the function that does the clustering of the items - used by the picker.
ClusterMethod
The type of hierarchical clustering algorithm to use.
HierarchicalClusterPicker(ClusterMethod clusterMethod)
Constructor - takes a ClusterMethod as an argument.