1
2
3
4
5
6
7 """ utility functionality for molecular similarity
8 includes a command line app for screening databases
9
10
11 Sample Usage:
12
13 python MolSimilarity.py -d data.gdb -t daylight_sig --idName="Mol_ID" \
14 --topN=100 --smiles='c1(C=O)ccc(Oc2ccccc2)cc1' --smilesTable=raw_dop_data \
15 --smilesName="structure" -o results.csv
16
17 """
18 from rdkit import RDConfig
19 from rdkit import DataStructs
20 from rdkit import Chem
21 from rdkit.Dbase.DbConnection import DbConnect
22 from rdkit.Dbase import DbModule
23 from rdkit.DataStructs.TopNContainer import TopNContainer
24 import sys,types
25 import cPickle
26 from rdkit.Chem.Fingerprints import FingerprintMols,DbFpSupplier
27 try:
28 from rdkit.VLib.NodeLib.DbPickleSupplier import _lazyDataSeq as _dataSeq
29 except ImportError:
30 _dataSeq=None
31
32
33 from rdkit import DataStructs
34
# Revision-control keyword string; the text between the first ':' and the
# closing '$' is what gets reported as the version.
_cvsVersion = "$Id: MolSimilarity.py 997 2009-02-25 06:12:43Z glandrum $"
# slice bounds: one past the first colon, up to (not including) the last '$'
idx1 = 1 + _cvsVersion.find(':')
idx2 = _cvsVersion.rfind('$')
__VERSION_STRING = "%s" % (_cvsVersion[idx1:idx2],)
39
40
42 fields = '%s.%s'%(details.tableName,details.idName)
43 join = ''
44 if details.smilesTableName:
45 if details.smilesName:
46 fields = fields + ',%s'%(details.smilesName)
47 join='join %s smi on smi.%s=%s.%s'%(details.smilesTableName,
48 details.idName,
49 details.tableName,
50 details.idName)
51 if details.actTableName:
52 if details.actName:
53 fields = fields + ',%s'%(details.actName)
54 join = join + 'join %s act on act.%s=%s.%s'%(details.actTableName,
55 details.idName,
56 details.tableName,
57 details.idName)
58
59 if extraFields:
60 fields += ','+extraFields
61 cmd = 'select %s from %s %s'%(fields,details.tableName,join)
62 return cmd
63
def ScreenInDb(details, mol):
    """ screens the fingerprints stored in a database against a probe molecule

      Arguments:
        - details: the screening parameters; its __dict__ is passed as keyword
          arguments to FingerprintMols.FingerprintMol, and the attributes
          dbName, tableName, metric, fpColName, doThreshold, screenThresh and
          topN are read here (dbUser / dbPassword are used when present)
        - mol: the probe molecule

      Returns:
        - [] if the probe cannot be fingerprinted or no database connection
          is available for a database-side search
        - for Tanimoto, Dice and cosine metrics: the rows fetched from the
          database, where the similarity is computed by the rd_tanimoto /
          rd_dice / rd_cosine SQL function and selected as "tani"
        - for any other metric: whatever ScreenFingerprints() returns for the
          data from GetFingerprints()

      NOTE: with doThreshold set, the SQL filter keeps rows whose similarity
      is strictly greater than screenThresh; otherwise a positive topN is
      applied as a row limit.

    """
    try:
        # equivalent to the old apply(func,(mol,),details.__dict__)
        probeFp = FingerprintMols.FingerprintMol(mol, **details.__dict__)
    except Exception:
        import traceback
        FingerprintMols.error('Error: problems fingerprinting molecule.\n')
        traceback.print_exc()
        return []

    conn = None
    if details.dbName and details.tableName:
        try:
            conn = DbConnect(details.dbName, details.tableName)
            if hasattr(details, 'dbUser'):
                conn.user = details.dbUser
            if hasattr(details, 'dbPassword'):
                conn.password = details.dbPassword
        except Exception:
            import traceback
            FingerprintMols.error('Error: Problems establishing connection to database: %s|%s\n' % (details.dbName,
                                                                                                  details.tableName))
            traceback.print_exc()
            conn = None

    # metrics which have a database-side implementation
    func = None
    for metric, sqlName in ((DataStructs.TanimotoSimilarity, 'rd_tanimoto'),
                            (DataStructs.DiceSimilarity, 'rd_dice'),
                            (DataStructs.CosineSimilarity, 'rd_cosine')):
        if details.metric == metric:
            func = sqlName
            break

    if func is None:
        # no database-side implementation: pull the fingerprints and score
        # them in python
        data = GetFingerprints(details)
        return ScreenFingerprints(details, data, mol)

    if conn is None:
        # previously this fell through to a NameError on the unbound connection
        FingerprintMols.error('Error: no database connection available for screening.\n')
        return []

    pkl = probeFp.ToBitString()
    extraFields = "%s(%s,%s) as tani" % (func, DbModule.placeHolder, details.fpColName)
    cmd = _ConstructSQL(details, extraFields=extraFields)

    if details.doThreshold:
        cmd = "select * from (%s) tmp where tani>%f" % (cmd, details.screenThresh)
    cmd += " order by tani desc"
    if not details.doThreshold and details.topN > 0:
        cmd += " limit %d" % details.topN
    curs = conn.GetCursor()
    # the probe bit string is bound to the placeholder inside the SQL function call
    curs.execute(cmd, (pkl,))
    res = curs.fetchall()

    return res
115
def GetFingerprints(details):
    """ returns an iterable sequence of fingerprints
    each fingerprint will have a _fieldsFromDb member whose first entry is
    the id.

      Arguments:
        - details: the screening parameters. If dbName and tableName are set
          the fingerprints come from the database (fpColName and noPickle are
          read, dbUser / dbPassword are used when present); otherwise, if
          inFileName is set, they are read from that file, which should
          contain a series of pickled (id,fingerprint) tuples.

      Returns:
        - a supplier object for the database case
        - a list of fingerprints for the file case (empty if the file cannot
          be opened)
        - None if neither source is specified or the database connection
          could not be established

    """
    suppl = None
    if details.dbName and details.tableName:
        conn = None
        try:
            conn = DbConnect(details.dbName, details.tableName)
            if hasattr(details, 'dbUser'):
                conn.user = details.dbUser
            if hasattr(details, 'dbPassword'):
                conn.password = details.dbPassword
        except Exception:
            import traceback
            FingerprintMols.error('Error: Problems establishing connection to database: %s|%s\n' % (details.dbName,
                                                                                                  details.tableName))
            traceback.print_exc()
            conn = None
        if conn is not None:
            cmd = _ConstructSQL(details, extraFields=details.fpColName)
            curs = conn.GetCursor()
            if _dataSeq:
                suppl = _dataSeq(curs, cmd, depickle=not details.noPickle, klass=DataStructs.ExplicitBitVect)
                # the connection is stored on the sequence class; presumably
                # this keeps it alive while the lazy sequence is consumed
                _dataSeq._conn = conn
            else:
                # NOTE(review): `data` is never assigned in this function, so
                # this fallback raises a NameError as written. It needs the
                # result set for `cmd`; confirm the intended DbConnect call
                # before changing it.
                suppl = DbFpSupplier.ForwardDbFpSupplier(data, fpColName=details.fpColName)
    elif details.inFileName:
        suppl = []
        try:
            # pickles are binary data, so the file is opened in binary mode
            inF = open(details.inFileName, 'rb')
        except IOError:
            import traceback
            FingerprintMols.error('Error: Problems reading from file %s\n' % (details.inFileName))
            traceback.print_exc()
        else:
            try:
                while 1:
                    try:
                        # SECURITY: unpickling can execute arbitrary code;
                        # only read fingerprint files from trusted sources
                        molId, fp = cPickle.load(inF)
                    except EOFError:
                        # normal end of the series of pickles
                        break
                    except Exception:
                        # a damaged record ends the read; keep what was loaded
                        import traceback
                        FingerprintMols.error('Error: Problems reading from file %s\n' % (details.inFileName))
                        traceback.print_exc()
                        break
                    fp._fieldsFromDb = [molId]
                    suppl.append(fp)
            finally:
                inF.close()

    return suppl
166
def ScreenFingerprints(details, data, mol=None, probeFp=None):
    """ Returns a list of results

      Arguments:
        - details: the screening parameters; doThreshold, topN, noPickle and
          metric are read, screenThresh is read when doThreshold is set, and
          stopAfter (if present) caps the number of entries examined
        - data: an iterable of entries to screen. With noPickle unset each
          entry is either an (id,fingerprint) tuple/list or a fingerprint
          with a _fieldsFromDb member whose first entry is the id; with
          noPickle set each entry is an (id,pkl) pair and details.metric is
          called directly with the probe and str(pkl)
        - mol: the probe molecule, only used if probeFp is not provided
        - probeFp: (optional) the probe fingerprint

      Returns: a list of (id,score) tuples; an empty list if the probe
        could not be fingerprinted or its fingerprint is empty.

        - if doThreshold is set: entries with score >= screenThresh
        - else if topN > 0: the contents of a TopNContainer of size topN
        - otherwise: every entry examined

    """
    if probeFp is None:
        try:
            # equivalent to the old apply(func,(mol,),details.__dict__)
            probeFp = FingerprintMols.FingerprintMol(mol, **details.__dict__)
        except Exception:
            import traceback
            FingerprintMols.error('Error: problems fingerprinting molecule.\n')
            traceback.print_exc()
            return []
    if not probeFp:
        return []

    # decided once up front rather than by testing the container's truthiness
    useTopN = (not details.doThreshold) and details.topN > 0
    topN = None
    if useTopN:
        topN = TopNContainer(details.topN)

    res = []
    hasLimit = hasattr(details, 'stopAfter')
    count = 0
    for pt in data:
        if not details.noPickle:
            if isinstance(pt, (tuple, list)):
                molId, fp = pt
            else:
                fp = pt
                molId = pt._fieldsFromDb[0]
            score = DataStructs.FingerprintSimilarity(probeFp, fp, details.metric)
        else:
            molId, pkl = pt
            score = details.metric(probeFp, str(pkl))
        if useTopN:
            topN.Insert(score, molId)
        elif not details.doThreshold or score >= details.screenThresh:
            res.append((molId, score))
        count += 1
        if hasLimit and count >= details.stopAfter:
            break

    if useTopN:
        # the container yields (score,id); results are reported as (id,score)
        for score, molId in topN:
            res.append((molId, score))

    return res
213
252
253 _usageDoc="""
254 Usage: MolSimilarity.py [args] <fName>
255
256 If <fName> is provided and no tableName is specified (see below),
257 data will be read from the pickled file <fName>. This file should
258 contain a series of pickled (id,fingerprint) tuples.
259
260 NOTE: at the moment the user is responsible for ensuring that the
261 fingerprint parameters given at run time (used to fingerprint the
262 probe molecule) match those used to generate the input fingerprints.
263
264 Command line arguments are:
265 - --smiles=val: sets the SMILES for the input molecule. This is
266 a required argument.
267
268 - -d _dbName_: set the name of the database from which
269 to pull input fingerprint information.
270
271 - -t _tableName_: set the name of the database table
272 from which to pull input fingerprint information
273
274 - --smilesTable=val: sets the name of the database table
275 which contains SMILES for the input fingerprints. If this
276 information is provided along with smilesName (see below),
277 the output file will contain SMILES data
278
279 - --smilesName=val: sets the name of the SMILES column
280 in the input database. Default is *SMILES*.
281
282 - --topN=val: sets the number of results to return.
283 Default is *10*.
284
285 - --thresh=val: sets the similarity threshold.
286
287 - --idName=val: sets the name of the id column in the input
288 database. Default is *ID*.
289
290 - -o _outFileName_: name of the output file (output will
291 be a CSV file with one line for each of the output molecules
292
293 - --dice: use the DICE similarity metric instead of Tanimoto
294
295 - --cosine: use the cosine similarity metric instead of Tanimoto
296
297 - --fpColName=val: name to use for the column which stores
298 fingerprints (in pickled format) in the output db table.
299 Default is *AutoFragmentFP*
300
301 - --minPath=val: minimum path length to be included in
302 fragment-based fingerprints. Default is *1*.
303
304 - --maxPath=val: maximum path length to be included in
305 fragment-based fingerprints. Default is *7*.
306
307 - --nBitsPerHash: number of bits to be set in the output
308 fingerprint for each fragment. Default is *4*.
309
310 - --discrim: use of path-based discriminators to hash bits.
311 Default is *false*.
312
313 - -V: include valence information in the fingerprints
314 Default is *false*.
315
316 - -H: include Hs in the fingerprint
317 Default is *false*.
318
319 - --useMACCS: use the public MACCS keys to do the fingerprinting
320 (instead of a daylight-type fingerprint)
321
322
323 """
if __name__ == '__main__':
    # command-line entry point: report the version, then screen using the
    # parameters parsed from the command line
    FingerprintMols.message("This is MolSimilarity version %s\n\n" % (__VERSION_STRING,))
    # install this module's usage text before parsing the arguments
    # (presumably ParseArgs reports it on bad input -- verify in FingerprintMols)
    FingerprintMols._usageDoc = _usageDoc
    details = FingerprintMols.ParseArgs()
    ScreenFromDetails(details)
329