1
2
3
4
5
6
7 """ utility functionality for clustering molecules using fingerprints
8 includes a command line app for clustering
9
10
11 Sample Usage:
12 python ClusterMols.py -d data.gdb -t daylight_sig \
13 --idName="CAS_TF" -o clust1.pkl \
14 --actTable="dop_test" --actName="moa_quant"
15
16 """
17 from rdkit.Dbase.DbConnection import DbConnect
18 from rdkit.Dbase import DbInfo,DbUtils
19 from rdkit.ML.Data import DataUtils
20 from rdkit.ML.Cluster import Clusters
21 from rdkit.ML.Cluster import Murtagh
22 import sys,cPickle
23 from rdkit.Chem.Fingerprints import FingerprintMols,MolSimilarity
24 from rdkit import DataStructs
25 import numpy
# The version banner is the text found between the first ':' and the last
# '$' of the $Id$-style string below.
_cvsVersion = "$Id: ClusterMols.py 997 2009-02-25 06:12:43Z glandrum $"
idx1 = 1 + _cvsVersion.find(':')
idx2 = _cvsVersion.rfind('$')
__VERSION_STRING = "%s" % (_cvsVersion[idx1:idx2],)

# module-level aliases for the FingerprintMols output callables
# (message() is what this module uses for its progress output)
message = FingerprintMols.message
error = FingerprintMols.error
33
35 """ data should be a list of tuples with fingerprints in position 1
36 (the rest of the elements of the tuple are not important)
37
38 Returns the symmetric distance matrix
39 (see ML.Cluster.Resemblance for layout documentation)
40
41 """
42 nPts = len(data)
43 res = numpy.zeros((nPts*(nPts-1)/2),numpy.float)
44 nSoFar=0
45 for col in xrange(1,nPts):
46 for row in xrange(col):
47 fp1 = data[col][1]
48 fp2 = data[row][1]
49 if fp1.GetNumBits()>fp2.GetNumBits():
50 fp1 = DataStructs.FoldFingerprint(fp1,fp1.GetNumBits()/fp2.GetNumBits())
51 elif fp2.GetNumBits()>fp1.GetNumBits():
52 fp2 = DataStructs.FoldFingerprint(fp2,fp2.GetNumBits()/fp1.GetNumBits())
53 sim = metric(fp1,fp2)
54 if isSimilarity:
55 sim = 1.-sim
56 res[nSoFar] = sim
57 nSoFar += 1
58 return res
59
def ClusterPoints(data,metric,algorithmId,haveLabels=False,haveActs=True,returnDistances=False):
  """ builds a cluster tree from a set of points and labels its leaves

    **Arguments**

      - data: a sequence of tuples.  Position 0 of each tuple is used to
        label the corresponding point, position 1 is what
        GetDistanceMatrix() reads, and position 2 (when present) is taken
        as the activity value of the point.

      - metric: passed straight through to GetDistanceMatrix()

      - algorithmId: passed straight through to Murtagh.ClusterData()

      - haveLabels: if false, each label is built as 'Mol: <position 0>';
        otherwise position 0 is used unchanged as the label

      - haveActs: if true and the first tuple has more than two elements,
        position 2 of every tuple is converted to an int and attached to
        the tree and to its points

      - returnDistances: if true, the distance matrix is returned along
        with the tree

    **Returns**

      the cluster tree, or the 2-tuple (cluster tree, distance matrix)
      when returnDistances is true

  """
  message('Generating distance matrix.\n')
  dMat = GetDistanceMatrix(data,metric)
  message('Clustering\n')
  # ClusterData() returns a sequence; only its first element is used here
  clustTree = Murtagh.ClusterData(dMat,len(data),algorithmId,
                                  isDistData=1)[0]

  acts = []
  if haveActs and len(data[0])>2:
    acts = [int(x[2]) for x in data]

  if not haveLabels:
    labels = ['Mol: %s'%str(x[0]) for x in data]
  else:
    labels = [x[0] for x in data]

  clustTree._ptLabels = labels
  if acts:
    clustTree._ptValues = acts

  for pt in clustTree.GetPoints():
    # point indices are shifted by one relative to positions in data
    idx = pt.GetIndex()-1
    pt.SetName(labels[idx])
    if acts:
      # Attaching the activity is best effort: a failure leaves the point
      # without data instead of aborting the clustering.  Only Exception
      # is caught so that KeyboardInterrupt/SystemExit still propagate.
      try:
        pt.SetData(acts[idx])
      except Exception:
        pass

  if returnDistances:
    return clustTree,dMat
  return clustTree
90
115
# Usage text for the command-line app; the __main__ block below assigns it
# to FingerprintMols._usageDoc before the arguments are parsed.
116 _usageDoc="""
117 Usage: ClusterMols.py [args] <fName>
118
119 If <fName> is provided and no tableName is specified (see below),
120 data will be read from the text file <fName>. Text files delimited
121 with either commas (extension .csv) or tabs (extension .txt) are
122 supported.
123
124 Command line arguments are:
125
126 - -d _dbName_: set the name of the database from which
127 to pull input fingerprint information.
128
129 - -t _tableName_: set the name of the database table
130 from which to pull input fingerprint information
131
132 - --idName=val: sets the name of the id column in the input
133 database. Default is *ID*.
134
135 - -o _outFileName_: name of the output file (output will
136 be a pickle (.pkl) file with the cluster tree)
137
138 - --actTable=val: name of table containing activity values
139 (used to color points in the cluster tree).
140
141 - --actName=val: name of column with activities in the activity
142 table. The values in this column should either be integers or
143 convertible into integers.
144
145 - --SLINK: use the single-linkage clustering algorithm
146 (default is Ward's minimum variance)
147
148 - --CLINK: use the complete-linkage clustering algorithm
149 (default is Ward's minimum variance)
150
151 - --UPGMA: use the group-average clustering algorithm
152 (default is Ward's minimum variance)
153
154 - --dice: use the DICE similarity metric instead of Tanimoto
155
156 - --cosine: use the cosine similarity metric instead of Tanimoto
157
158 - --fpColName=val: name to use for the column which stores
159 fingerprints (in pickled format) in the input db table.
160 Default is *AutoFragmentFP*
161
162 - --minPath=val: minimum path length to be included in
163 fragment-based fingerprints. Default is *2*.
164
165 - --maxPath=val: maximum path length to be included in
166 fragment-based fingerprints. Default is *7*.
167
168 - --nBitsPerHash: number of bits to be set in the output
169 fingerprint for each fragment. Default is *4*.
170
171 - --discrim: use of path-based discriminators to hash bits.
172 Default is *false*.
173
174 - -V: include valence information in the fingerprints
175 Default is *false*.
176
177 - -H: include Hs in the fingerprint
178 Default is *false*.
179
180 - --useMACCS: use the public MACCS keys to do the fingerprinting
181 (instead of a daylight-type fingerprint)
182
183
184 """
if __name__ == '__main__':
  # print the version banner before doing anything else
  message("This is ClusterMols version %s\n\n" % (__VERSION_STRING,))
  # install this module's usage text on FingerprintMols ahead of the
  # ParseArgs() call (presumably so its parser reports our options)
  FingerprintMols._usageDoc = _usageDoc
  details = FingerprintMols.ParseArgs()
  ClusterFromDetails(details)
190