Package rdkit :: Package ML :: Package Cluster :: Module ClusterUtils
[hide private]
[frames] | no frames]

Source Code for Module rdkit.ML.Cluster.ClusterUtils

  1  # $Id: ClusterUtils.py 997 2009-02-25 06:12:43Z glandrum $ 
  2  # 
  3  # Copyright (C) 2001-2008  greg Landrum 
  4  # 
  5  #   @@ All Rights Reserved  @@ 
  6  # 
  7  """utility functions for clustering 
  8   
  9  """ 
 10   
11 -def GetNodeList(cluster):
12 """returns an ordered list of all nodes below cluster 13 14 the ordering is done using the lengths of the child nodes 15 16 **Arguments** 17 18 - cluster: the cluster in question 19 20 **Returns** 21 22 - a list of the leaves below this cluster 23 24 """ 25 if len(cluster) == 1: 26 return [cluster] 27 else: 28 children = cluster.GetChildren() 29 children.sort(lambda x,y:cmp(len(y),len(x))) 30 res = [] 31 for child in children: 32 res += GetNodeList(child) 33 res += [cluster] 34 return res
35
36 -def GetNodesDownToCentroids(cluster,above=1):
37 """returns an ordered list of all nodes below cluster 38 39 40 """ 41 if hasattr(cluster,'_isCentroid'): 42 cluster._aboveCentroid = 0 43 above = -1 44 else: 45 cluster._aboveCentroid = above 46 if len(cluster) == 1: 47 return [cluster] 48 else: 49 res = [] 50 children = cluster.GetChildren() 51 children.sort(lambda x,y:cmp(len(y),len(x))) 52 for child in children: 53 res = res + GetNodesDownToCentroids(child,above) 54 res = res + [cluster] 55 return res
56
57 -def FindClusterCentroidFromDists(cluster,dists):
58 """ find the point in a cluster which has the smallest summed 59 Euclidean distance to all others 60 61 **Arguments** 62 63 - cluster: the cluster to work with 64 65 - dists: the distance matrix to use for the points 66 67 **Returns** 68 69 - the index of the centroid point 70 71 """ 72 children = cluster.GetPoints() 73 pts = [x.GetData() for x in children] 74 75 best = 1e24 76 bestIdx = -1 77 for pt in pts: 78 dAccum = 0.0 79 # loop over others and add'em up 80 for other in pts: 81 if other != pt: 82 if other > pt: row,col = pt,other 83 else: row,col = other,pt 84 dAccum += dists[col*(col-1)/2+row] 85 if dAccum >= best: 86 # minor efficiency hack 87 break 88 if dAccum < best: 89 best = dAccum 90 bestIdx = pt 91 for i in range(len(pts)): 92 pt = pts[i] 93 if pt != bestIdx: 94 if pt > bestIdx: row,col = bestIdx,pt 95 else: row,col = pt,bestIdx 96 children[i]._distToCenter = dists[col*(col-1)/2+row] 97 else: 98 children[i]._distToCenter = 0.0 99 children[i]._clustCenter = bestIdx 100 cluster._clustCenter=bestIdx 101 cluster._distToCenter=0.0 102 103 return bestIdx
104
105 -def _BreadthFirstSplit(cluster,n):
106 """ *Internal Use Only* 107 108 """ 109 if len(cluster)<n: 110 raise ValueError,'Cannot split cluster of length %d into %d pieces'%(len(cluster),n) 111 if len(cluster)==n: 112 return cluster.GetPoints() 113 clusters = [cluster] 114 nxtIdx = 0 115 for i in range(n-1): 116 while nxtIdx<len(clusters) and len(clusters[nxtIdx])==1: 117 nxtIdx+=1 118 assert nxtIdx<len(clusters) 119 120 children = clusters[nxtIdx].GetChildren() 121 children.sort(_sortHelper) 122 for child in children: 123 clusters.append(child) 124 del clusters[nxtIdx] 125 return clusters
126
127 -def _sortHelper(ca,cb):
128 """ *Internal Use Only* 129 130 Used to sort cluster nodes 131 132 """ 133 return cmp(cb.GetMetric(),ca.GetMetric())
134
135 -def _HeightFirstSplit(cluster,n):
136 """ *Internal Use Only* 137 138 """ 139 if len(cluster)<n: 140 raise ValueError,'Cannot split cluster of length %d into %d pieces'%(len(cluster),n) 141 if len(cluster)==n: 142 return cluster.GetPoints() 143 clusters = [cluster] 144 for i in range(n-1): 145 nxtIdx = 0 146 while nxtIdx<len(clusters) and len(clusters[nxtIdx])==1: 147 nxtIdx+=1 148 assert nxtIdx<len(clusters) 149 150 children = clusters[nxtIdx].GetChildren() 151 for child in children: 152 clusters.append(child) 153 del clusters[nxtIdx] 154 clusters.sort(_sortHelper) 155 return clusters
156 157
158 -def SplitIntoNClusters(cluster,n,breadthFirst=1):
159 """ splits a cluster tree into a set of branches 160 161 **Arguments** 162 163 - cluster: the root of the cluster tree 164 165 - n: the number of clusters to include in the split 166 167 - breadthFirst: toggles breadth first (vs depth first) cleavage 168 of the cluster tree. 169 170 **Returns** 171 172 - a list of sub clusters 173 174 """ 175 if breadthFirst: 176 return _BreadthFirstSplit(cluster,n) 177 else: 178 return _HeightFirstSplit(cluster,n)
179