Package ML :: Package Cluster :: Module ClusterUtils
[hide private]
[frames] | no frames]

Source Code for Module ML.Cluster.ClusterUtils

  1  # $Id: ClusterUtils.py 2 2006-05-06 22:54:39Z glandrum $ 
  2  # 
  3  # Copyright (C) 2001-2006  greg Landrum 
  4  # 
  5  #   @@ All Rights Reserved  @@ 
  6  # 
  7  """utility functions for clustering 
  8   
  9  """ 
 10  from Numeric import * 
 11   
12 -def GetNodeList(cluster):
13 """returns an ordered list of all nodes below cluster 14 15 the ordering is done using the lengths of the child nodes 16 17 **Arguments** 18 19 - cluster: the cluster in question 20 21 **Returns** 22 23 - a list of the leaves below this cluster 24 25 """ 26 if len(cluster) == 1: 27 return [cluster] 28 else: 29 res = [] 30 children = cluster.GetChildren() 31 children.sort(lambda x,y:cmp(len(y),len(x))) 32 for child in children: 33 res += GetNodeList(child) 34 res += [cluster] 35 return res
36
37 -def GetNodesDownToCentroids(cluster,above=1):
38 """returns an ordered list of all nodes below cluster 39 40 41 """ 42 if hasattr(cluster,'_isCentroid'): 43 cluster._aboveCentroid = 0 44 above = -1 45 else: 46 cluster._aboveCentroid = above 47 if len(cluster) == 1: 48 return [cluster] 49 else: 50 res = [] 51 children = cluster.GetChildren() 52 children.sort(lambda x,y:cmp(len(y),len(x))) 53 for child in children: 54 res = res + GetNodesDownToCentroids(child,above) 55 res = res + [cluster] 56 return res
57
58 -def FindClusterCentroidFromDists(cluster,dists):
59 """ find the point in a cluster which has the smallest summed 60 Euclidean distance to all others 61 62 **Arguments** 63 64 - cluster: the cluster to work with 65 66 - dists: the distance matrix to use for the points 67 68 **Returns** 69 70 - the index of the centroid point 71 72 """ 73 children = cluster.GetPoints() 74 pts = map(lambda x:x.GetData(),children) 75 76 best = 1e24 77 bestIdx = -1 78 for pt in pts: 79 dAccum = 0.0 80 # loop over others and add'em up 81 for other in pts: 82 if other != pt: 83 if other > pt: row,col = pt,other 84 else: row,col = other,pt 85 dAccum += dists[col*(col-1)/2+row] 86 if dAccum >= best: 87 # minor efficiency hack 88 break 89 if dAccum < best: 90 best = dAccum 91 bestIdx = pt 92 for i in range(len(pts)): 93 pt = pts[i] 94 if pt != bestIdx: 95 if pt > bestIdx: row,col = bestIdx,pt 96 else: row,col = pt,bestIdx 97 children[i]._distToCenter = dists[col*(col-1)/2+row] 98 else: 99 children[i]._distToCenter = 0.0 100 children[i]._clustCenter = bestIdx 101 cluster._clustCenter=bestIdx 102 cluster._distToCenter=0.0 103 104 return bestIdx
105
106 -def _BreadthFirstSplit(cluster,n):
107 """ *Internal Use Only* 108 109 """ 110 if len(cluster)<n: 111 raise ValueError,'Cannot split cluster of length %d into %d pieces'%(len(cluster),n) 112 if len(cluster)==n: 113 return cluster.GetPoints() 114 clusters = [cluster] 115 nxtIdx = 0 116 for i in range(n-1): 117 while nxtIdx<len(clusters) and len(clusters[nxtIdx])==1: 118 nxtIdx+=1 119 assert nxtIdx<len(clusters) 120 121 children = clusters[nxtIdx].GetChildren() 122 children.sort(_sortHelper) 123 for child in children: 124 clusters.append(child) 125 del clusters[nxtIdx] 126 return clusters
127
128 -def _sortHelper(ca,cb):
129 """ *Internal Use Only* 130 131 Used to sort cluster nodes 132 133 """ 134 return cmp(cb.GetMetric(),ca.GetMetric())
135
136 -def _HeightFirstSplit(cluster,n):
137 """ *Internal Use Only* 138 139 """ 140 if len(cluster)<n: 141 raise ValueError,'Cannot split cluster of length %d into %d pieces'%(len(cluster),n) 142 if len(cluster)==n: 143 return cluster.GetPoints() 144 clusters = [cluster] 145 for i in range(n-1): 146 nxtIdx = 0 147 while nxtIdx<len(clusters) and len(clusters[nxtIdx])==1: 148 nxtIdx+=1 149 assert nxtIdx<len(clusters) 150 151 children = clusters[nxtIdx].GetChildren() 152 for child in children: 153 clusters.append(child) 154 del clusters[nxtIdx] 155 clusters.sort(_sortHelper) 156 return clusters
157 158
159 -def SplitIntoNClusters(cluster,n,breadthFirst=1):
160 """ splits a cluster tree into a set of branches 161 162 **Arguments** 163 164 - cluster: the root of the cluster tree 165 166 - n: the number of clusters to include in the split 167 168 - breadthFirst: toggles breadth first (vs depth first) cleavage 169 of the cluster tree. 170 171 **Returns** 172 173 - a list of sub clusters 174 175 """ 176 if breadthFirst: 177 return _BreadthFirstSplit(cluster,n) 178 else: 179 return _HeightFirstSplit(cluster,n)
180