1
2
3
4
5
6
7 """ Interface to the C++ Murtagh hierarchic clustering code
8
9 """
10 from rdkit.ML.Cluster import Clusters
11 from rdkit.ML.Cluster.Clustering import MurtaghCluster,MurtaghDistCluster
12 import numpy
13
14
15
16 WARDS=1
17 SLINK=2
18 CLINK=3
19 UPGMA=4
20 MCQUITTY=5
21 GOWER=6
22 CENTROID=7
23
24
25 methods = [
26 ("Ward's Minimum Variance",WARDS,"Ward's Minimum Variance"),
27 ('Average Linkage',UPGMA,'Group Average Linkage (UPGMA)'),
28 ('Single Linkage',SLINK,'Single Linkage (SLINK)'),
29 ('Complete Linkage',CLINK,'Complete Linkage (CLINK)'),
30
31
32 ("Centroid",CENTROID,"Centroid method"),
33 ]
34
36 """ *Internal Use Only*
37
38 returns the distance between points i and j in the symmetric
39 distance matrix _dists_
40
41 """
42 if i==j: return 0.0
43 if i > j: i,j=j,i
44 return dists[j*(j-1)/2+i]
45
46
47
49 """ *Internal Use Only*
50
51 Converts the results of the Murtagh clustering code into
52 a cluster tree, which is returned in a single-entry list
53
54 """
55 cs = [None]*nPts
56 for i in range(nPts):
57 cs[i] = Clusters.Cluster(metric=0.0,data=i,index=(i+1))
58
59 nClus = len(ia)-1
60 for i in range(nClus):
61 idx1 = ia[i]-1
62 idx2 = ib[i]-1
63 c1 = cs[idx1]
64 c2 = cs[idx2]
65 newClust = Clusters.Cluster(metric=crit[i],children=[c1,c2],
66 index=nPts+i+1)
67 cs[idx1] = newClust
68
69 return [newClust]
70
71
73 """ clusters the data points passed in and returns the cluster tree
74
75 **Arguments**
76
77 - data: a list of lists (or array, or whatever) with the input
78 data (see discussion of _isDistData_ argument for the exception)
79
80 - nPts: the number of points to be used
81
82 - method: determines which clustering algorithm should be used.
83 The defined constants for these are:
84 'WARDS, SLINK, CLINK, UPGMA'
85
86 - isDistData: set this toggle when the data passed in is a
87 distance matrix. The distance matrix should be stored
88 symmetrically so that _LookupDist (above) can retrieve
89 the results:
90 for i<j: d_ij = dists[j*(j-1)/2 + i]
91
92
93 **Returns**
94
95 - a single entry list with the cluster tree
96 """
97 data = numpy.array(data)
98 if not isDistData:
99 sz = data.shape[1]
100 ia,ib,crit = MurtaghCluster(data,nPts,sz,method)
101 else:
102 ia,ib,crit = MurtaghDistCluster(data,nPts,method)
103 c = _ToClusters(data,nPts,ia,ib,crit,isDistData=isDistData)
104
105 return c
106