Package rdkit :: Package ML :: Package Cluster :: Module Clusters
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.Cluster.Clusters

  1  # $Id: Clusters.py 997 2009-02-25 06:12:43Z glandrum $ 
  2  # 
  3  # Copyright (C) 2001-2008  greg Landrum 
  4  # 
  5  #   @@ All Rights Reserved  @@ 
  6  # 
  7  """ contains the Cluster class for representing hierarchical cluster trees 
  8   
  9  """ 
 10  import numpy 
 11   
 12  CMPTOL=1e-6 
 13   
14 -class Cluster(object):
15 """a class for storing clusters/data 16 17 **General Remarks** 18 19 - It is assumed that the bottom of any cluster hierarchy tree is composed of 20 the individual data points which were clustered. 21 22 - Clusters objects store the following pieces of data, most are 23 accessible via standard Setters/Getters: 24 25 - Children: *Not Settable*, the list of children. You can add children 26 with the _AddChild()_ and _AddChildren()_ methods. 27 28 **Note** this can be of arbitrary length, 29 but the current algorithms I have only produce trees with two children 30 per cluster 31 32 - Metric: the metric for this cluster (i.e. how far apart its children are) 33 34 - Index: the order in which this cluster was generated 35 36 - Points: *Not Settable*, the list of original points in this cluster 37 (calculated recursively from the children) 38 39 - PointsPositions: *Not Settable*, the list of positions of the original 40 points in this cluster (calculated recursively from the children) 41 42 - Position: the location of the cluster **Note** for a cluster this 43 probably means the location of the average of all the Points which are 44 its children. 45 46 - Data: a data field. This is used with the original points to store their 47 data value (i.e. the value we're using to classify) 48 49 - Name: the name of this cluster 50 51 """
52 - def __init__(self,metric=0.0,children=None,position=None,index=-1,name=None,data=None):
53 """Constructor 54 55 **Arguments** 56 57 see the class documentation for the meanings of these arguments 58 59 *my wrists are tired* 60 61 """ 62 if children is None: 63 children = [] 64 if position is None: 65 position = [] 66 self.metric = metric 67 self.children = children 68 self._UpdateLength() 69 self.pos = position 70 self.index = index 71 self.name = name 72 self._points = None 73 self._pointsPositions = None 74 self.data = data
75
76 - def SetMetric(self,metric):
77 self.metric = metric
78 - def GetMetric(self):
79 return self.metric
80
81 - def SetIndex(self,index):
82 self.index = index
83 - def GetIndex(self):
84 return self.index
85
86 - def SetPosition(self,pos):
87 self.pos = pos
88 - def GetPosition(self):
89 return self.pos
90
91 - def GetPointsPositions(self):
92 if self._pointsPositions is not None: 93 return self._pointsPositions 94 else: 95 self._GenPoints() 96 return self._pointsPositions
97
98 - def GetPoints(self):
99 if self._points is not None: 100 return self._points 101 else: 102 self._GenPoints() 103 return self._points
104
105 - def FindSubtree(self,index):
106 """ finds and returns the subtree with a particular index 107 """ 108 res = None 109 if index == self.index: 110 res = self 111 else: 112 for child in self.children: 113 res = child.FindSubtree(index) 114 if res: 115 break 116 return res
117
118 - def _GenPoints(self):
119 """ Generates the _Points_ and _PointsPositions_ lists 120 121 *intended for internal use* 122 123 """ 124 if len(self) == 1: 125 self._points = [self] 126 self._pointsPositions = [self.GetPosition()] 127 return self._points 128 else: 129 res = [] 130 children = self.GetChildren() 131 children.sort(lambda x,y:cmp(len(y),len(x))) 132 for child in children: 133 res += child.GetPoints() 134 self._points=res 135 self._pointsPositions = [x.GetPosition() for x in res]
136
137 - def AddChild(self,child):
138 """Adds a child to our list 139 140 **Arguments** 141 142 - child: a Cluster 143 144 """ 145 self.children.append(child) 146 self._GenPoints() 147 self._UpdateLength()
148 - def AddChildren(self,children):
149 """Adds a bunch of children to our list 150 151 **Arguments** 152 153 - children: a list of Clusters 154 155 """ 156 self.children += children 157 self._GenPoints() 158 self._UpdateLength()
159 - def RemoveChild(self,child):
160 """Removes a child from our list 161 162 **Arguments** 163 164 - child: a Cluster 165 166 """ 167 self.children.remove(child) 168 self._UpdateLength()
169
170 - def GetChildren(self):
171 self.children.sort(lambda x,y:cmp(x.GetMetric(),y.GetMetric())) 172 return self.children
173
174 - def SetData(self,data):
175 self.data = data
176 - def GetData(self):
177 return self.data
178
179 - def SetName(self,name):
180 self.name = name
181 - def GetName(self):
182 if self.name is None: 183 return 'Cluster(%d)'%(self.GetIndex()) 184 else: 185 return self.name
186
187 - def Print(self,level=0,showData=0,offset='\t'):
188 if not showData or self.GetData() is None: 189 print '%s%s%s Metric: %f'%(' '*level,self.GetName(),offset,self.GetMetric()) 190 else: 191 print '%s%s%s Data: %f\t Metric: %f'%(' '*level,self.GetName(),offset,self.GetData(),self.GetMetric()) 192 193 for child in self.GetChildren(): 194 child.Print(level=level+1,showData=showData,offset=offset)
195
196 - def Compare(self,other,ignoreExtras=1):
197 """ not as choosy as self==other 198 199 """ 200 if cmp(type(self),type(other)): 201 return cmp(type(self),type(other)) 202 203 if cmp(len(self),len(other)): 204 return cmp(len(self),len(other)) 205 206 if not ignoreExtras: 207 m1,m2=self.GetMetric(),other.GetMetric() 208 if abs(m1-m2)>CMPTOL: 209 return cmp(m1,m2) 210 211 if cmp(self.GetName(),other.GetName()): 212 return cmp(self.GetName(),other.GetName()) 213 214 sP = self.GetPosition() 215 oP = other.GetPosition() 216 try: 217 r = cmp(len(sP),len(oP)) 218 except: 219 pass 220 else: 221 if r: 222 return r 223 224 try: 225 r = cmp(sP,oP) 226 except: 227 r = sum(sP-oP) 228 if r: 229 return r 230 231 c1,c2=self.GetChildren(),other.GetChildren() 232 if cmp(len(c1),len(c2)): 233 return cmp(len(c1),len(c2)) 234 for i in range(len(c1)): 235 t = c1[i].Compare(c2[i],ignoreExtras=ignoreExtras) 236 if t: 237 return t 238 239 return 0
240
241 - def _UpdateLength(self):
242 """ updates our length 243 244 *intended for internal use* 245 246 """ 247 self._len = reduce(lambda x,y: len(y)+x,self.children,1)
248
249 - def IsTerminal(self):
250 return self._len<=1
251
252 - def __len__(self):
253 """ allows _len(cluster)_ to work 254 255 """ 256 return self._len
257
258 - def __cmp__(self,other):
259 """ allows _cluster1 == cluster2_ to work 260 261 """ 262 if cmp(type(self),type(other)): 263 return cmp(type(self),type(other)) 264 265 m1,m2=self.GetMetric(),other.GetMetric() 266 if abs(m1-m2)>CMPTOL: 267 return cmp(m1,m2) 268 269 if cmp(self.GetName(),other.GetName()): 270 return cmp(self.GetName(),other.GetName()) 271 272 c1,c2=self.GetChildren(),other.GetChildren() 273 return cmp(c1,c2)
274 275 276 if __name__ == '__main__': 277 from rdkit.ML.Cluster import ClusterUtils 278 root = Cluster(index=1,metric=1000) 279 c1 = Cluster(index=10,metric=100) 280 c1.AddChild(Cluster(index=30,metric=10)) 281 c1.AddChild(Cluster(index=31,metric=10)) 282 c1.AddChild(Cluster(index=32,metric=10)) 283 284 c2 = Cluster(index=11,metric=100) 285 c2.AddChild(Cluster(index=40,metric=10)) 286 c2.AddChild(Cluster(index=41,metric=10)) 287 288 root.AddChild(c1) 289 root.AddChild(c2) 290 291 nodes = ClusterUtils.GetNodeList(root) 292 293 indices = [x.GetIndex() for x in nodes] 294 print 'XXX:',indices 295