1
2
3
4
5 from Numeric import *
6 from DataStructs.TopNContainer import TopNContainer
7 import SVDPack
8 import PySVD
9
11 for row in mat:
12 for col in row:
13 print '% 6.3f'%col,
14 print
15
16
17 """
18 throughout we use the notation of:
19 Deerwester et al. J. Am. Soc. Inf. Sci. 41 391-407 (1990)
20
21 """
24 self._vects = None
25 self._idMap = None
26 self._tForm = None
27 self._DS = None
28 self._T = None
29
31 """
32
33 vects is a sequence of *sorted* sequences of bit IDs
34
35
36 >>> calc = SimilarityCalculator()
37 >>> calc.SetVects( ((1,2),(3,100),(1,2,2,4)) )
38 >>> calc._vects
39 ((0, 1), (2, 4), (0, 1, 3))
40 >>> calc._vals
41 (1, 1, 1, 1, 1, 2, 1)
42 >>> calc._idMap[100]
43 4
44 >>> calc._idMap[4]
45 3
46
47 """
48 self._idMap = {}
49 self._vects = []
50 self._vals = []
51 self._tForm = None
52 self._DS = None
53 self._T = None
54
55 tmpD = {}
56 for vect in vects:
57 for bit in vect:
58 if not tmpD.has_key(bit):
59 tmpD[bit] = 1
60 ks = tmpD.keys()
61 ks.sort()
62 for i in range(len(ks)):
63 self._idMap[ks[i]] = i
64 ks = None
65 tmpD = None
66
67 for vect in vects:
68 tmp = []
69 i = 0
70 nBits = len(vect)
71 while i<nBits:
72 bit = vect[i]
73
74
75
76
77
78
79 idx = self._idMap.get(bit)
80 if idx < 0:
81 idx = len(self._idMap)
82 self._idMap[bit] = idx
83
84 tmp.append(idx)
85
86 self._vals.append(1)
87
88 j=i+1
89 while(j<nBits and vect[j]==vect[i]):
90 self._vals[-1] += 1
91 j += 1
92 i=j
93
94
95
96 self._vects.append(tuple(tmp))
97 self._vects = tuple(self._vects)
98 self._vals = tuple(self._vals)
99
101 """
102 >>> calc = SimilarityCalculator()
103 >>> try:
104 ... calc.UpdateSingularValues()
105 ... except ValueError:
106 ... ok=1
107 ... else:
108 ... ok=0
109 >>> ok
110 1
111 >>> calc.SetVects( ((0,2),(1,3),(0,1,2)) )
112 >>> calc.UpdateSingularValues()
113 >>> calc._S.shape[0]
114 3
115
116 Unless the optional cleanup argument is unset,the local vects
117 (untransformed data points) will be destroyed after we're done
118 with them here. This can save significant memory:
119 >>> try:
120 ... calc.UpdateSingularValues(2)
121 ... except ValueError:
122 ... ok=1
123 ... else:
124 ... ok=0
125 >>> ok
126 1
127
128 Have to call SetVects again:
129 >>> calc.SetVects( ((0,2),(1,3),(0,1,2)) )
130 >>> calc.UpdateSingularValues(2)
131 >>> calc._S.shape[0]
132 2
133 >>> print '%.4f'%calc._S[0]
134 2.1889
135 >>> print '%.4f'%calc._S[1]
136 1.4142
137
138 """
139 if not self._vects:
140 raise ValueError,"SetVects() not called"
141
142 nRows = len(self._vects)
143 nCols = len(self._idMap)
144 if k==-1:
145 k = min(nRows,nCols)
146
147
148
149 D,s,T = PySVD.SparseSVD(self._vects,self._vals,nRows,nCols,k,1)
150 T = transpose(T)
151 D = transpose(D)
152
153
154 k = min(k,len(s))
155 self._S = s[:k]
156
157 self._T = T
158 self._DS = D*s
159
160 if 0:
161 print 'T:'
162 showMat(self._T)
163
164 print 'S:'
165 print s
166
167 print 'D:'
168 showMat(D)
169
170
171 print 'DS:'
172 showMat(self._DS)
173
174 print '------------------------'
175 showMat(matrixmultiply(transpose(T),T))
176
177 print '------------------------'
178 showMat(matrixmultiply(transpose(D),D))
179
180 print '------------------------'
181 print self._T.shape,self._DS.shape
182 showMat(matrixmultiply(self._T,transpose(self._DS)))
183
184
185
186 if cleanup:
187 self._vects = None
188 self._vals = None
189
191 """
192
193
194 >>> calc = SimilarityCalculator()
195 >>> try:
196 ... calc.UpdateSingularValues()
197 ... except ValueError:
198 ... ok=1
199 ... else:
200 ... ok=0
201 >>> ok
202 1
203 >>> calc.SetVects( ((0,2),(1,3),(0,1,2)) )
204 >>> calc.UpdateSingularValues()
205 >>> calc._S.shape[0]
206 3
207
208 Unless the optional cleanup argument is unset,the local vects
209 (untransformed data points) will be destroyed after we're done
210 with them here. This can save significant memory:
211 >>> try:
212 ... calc.UpdateSingularValues(2)
213 ... except ValueError:
214 ... ok=1
215 ... else:
216 ... ok=0
217 >>> ok
218 1
219
220 Have to call SetVects again:
221 >>> calc.SetVects( ((0,2),(1,3),(0,1,2)) )
222 >>> calc.UpdateSingularValues(2)
223 >>> calc._S.shape[0]
224 2
225 >>> print '%.4f'%calc._S[0]
226 2.1889
227 >>> print '%.4f'%calc._S[1]
228 1.4142
229
230 """
231 if not self._vects:
232 raise ValueError,"SetVects() not called"
233
234 nRows = len(self._vects)
235 nCols = len(self._idMap)
236 if k==-1:
237 k = min(nRows,nCols)
238
239 SVDPack.DoSVD(self,k)
240
241
242 if cleanup:
243 self._vects = None
244 self._vals = None
246 k = min(k,len(s))
247 self._S = s[:k]
248 self._T = T
249 self._DS = D*s
250 if cleanup:
251 self._vects = None
252 self._vals = None
253
254
255
256
258 """
259
260 converts a point from the normal space to the reduced (packed) space
261
262 >>> calc = SimilarityCalculator()
263 >>> calc.SetVects( ((1,2),(3,100),(1,2,2,4)) )
264 >>> calc.PackPoint( (1,2) )
265 array([ 1., 1., 0., 0., 0.])
266 >>> calc.PackPoint( (1,2,2) )
267 array([ 1., 2., 0., 0., 0.])
268 >>> calc.PackPoint( (1,2,5) )
269 array([ 1., 1., 0., 0., 0.])
270
271 """
272 if not self._idMap:
273 raise ValueError,"SetVects() not called"
274 res = zeros((len(self._idMap),),Float)
275 for val in pt:
276 idx = self._idMap.get(val,-1)
277 if idx>=0:
278 res[idx] += 1
279 return res
280
307
308 - def ScorePoint(self,pt,against=None,isTransformed=0,topN=0,
309 threshold=-1.0,
310 excludeThese=[]):
311 """
312
313 return value is a sequence of 2-tuples: (score, index)
314
315 >>> calc = SimilarityCalculator()
316 >>> calc.SetVects( ((0,2),(1,3),(0,1,2)) )
317 >>> calc.UpdateSingularValues(k=3)
318 >>> r = calc.ScorePoint((0,2),against=[0])[0]
319 >>> print '%.2f'%r[0], r[1]
320 1.00 0
321
322 can transform the point in advance:
323 >>> pt = calc.TransformPoint( (0,2) )
324 >>> r = calc.ScorePoint(pt,against=[0],isTransformed=1)[0]
325 >>> print '%.2f'%r[0], r[1]
326 1.00 0
327
328 default is to score against a variety of vectors at once:
329 >>> [abs(x[0])>1e-4 for x in calc.ScorePoint(pt,isTransformed=1)]
330 [True, False, True]
331
332 >>> [abs(x[0])>1e-4 for x in calc.ScorePoint((0,3,6))]
333 [True, True, True]
334
335 >>> [abs(x[0])>1e-4 for x in calc.ScorePoint((0,3,6),topN=2)]
336 [True, True]
337
338 you can also put a threshold on the similarity metric:
339 >>> len(calc.ScorePoint((0,3,6)))
340 3
341 >>> len(calc.ScorePoint((0,3,6),threshold=0.50))
342 2
343
344
345
346 "extra" bits (those that weren't in the training vectors) are
347 ignored:
348 >>> [abs(x[0])>1e-4 for x in calc.ScorePoint((0,2,12))]
349 [True, False, True]
350
351 # look at the indices:
352 >>> v = [x[1] for x in calc.ScorePoint((0,3,6),topN=2)]
353 >>> v.sort()
354 >>> v
355 [0, 1]
356 >>> [x[1] for x in calc.ScorePoint((0,3,6),topN=2,excludeThese=[1])]
357 [2, 0]
358
359
360 """
361 if not self._T or not self._DS:
362 raise ValueError,"UpdateSingularValues() not called"
363 if not isTransformed:
364 pt = self.TransformPoint(pt)
365 if against is None:
366 against = range(self._DS.shape[0])
367 try:
368 nPts = len(against)
369 except AttributeError:
370 against = [against]
371 nPts =1
372 if topN:
373 res = TopNContainer(topN)
374 else:
375 res = []
376 ptSize = sqrt(dot(pt,pt))
377
378 for idx in excludeThese:
379 try:
380 against.remove(idx)
381 except ValueError:
382 pass
383
384
385 for i in against:
386 v = self._DS[i]
387 vSize = sqrt(dot(v,v))
388 numer = dot(v,pt)
389 denom = vSize*ptSize
390
391 if denom != 0.0:
392 simVal = numer/denom
393 else:
394 simVal = 0.0
395 if simVal>threshold:
396 if not topN:
397 res.append((simVal,i))
398 else:
399 res.Insert(simVal,i)
400 return res
401
402
403
404
405
406 molTest="""
407
408 This is a nice test because not only is it molecular with vaguely
409 sensible results, but the matrix is only of rank 2 (despite being
410 3x3), so it can catch boundary conditions in the solver.
411
412 >>> import Chem
413 >>> from Chem.AtomPairs import Torsions,Utils
414 >>> m1 = Chem.MolFromSmiles('CCN(CCO)CC')
415 >>> m2 = Chem.MolFromSmiles('CCN(CCO)CCO')
416 >>> m3 = Chem.MolFromSmiles('OCCN(CCO)CCO')
417 >>> fp1 = Torsions.GetTopologicalTorsionFingerprintAsIds(m1)
418 >>> print fp1
419 >>> fp2 = Torsions.GetTopologicalTorsionFingerprintAsIds(m2)
420 >>> fp3 = Torsions.GetTopologicalTorsionFingerprintAsIds(m3)
421 >>> calc = SimilarityCalculator()
422 >>> calc.SetVects((fp1,fp2,fp3))
423 >>> calc.UpdateSingularValues()
424 >>> scores = calc.ScorePoint(fp1)
425 >>> print '%.3f'%scores[0][0],scores[0][1]
426 1.000 0
427 >>> print '%.3f'%scores[1][0],scores[1][1]
428 0.802 1
429 >>> print '%.3f'%scores[2][0],scores[2][1]
430 0.488 2
431 >>> scores = calc.ScorePoint(fp2)
432 >>> print '%.3f'%scores[0][0],scores[0][1]
433 0.802 0
434 >>> print '%.3f'%scores[1][0],scores[1][1]
435 1.000 1
436 >>> print '%.3f'%scores[2][0],scores[2][1]
437 0.913 2
438 >>> scores = calc.ScorePoint(fp3)
439 >>> print '%.3f'%scores[0][0],scores[0][1]
440 0.488 0
441 >>> print '%.3f'%scores[1][0],scores[1][1]
442 0.913 1
443 >>> print '%.3f'%scores[2][0],scores[2][1]
444 1.000 2
445 >>> scores = calc.ScorePoint(fp1,topN=2)
446 >>> scores.reverse()
447 >>> print '%.3f'%scores[0][0],scores[0][1]
448 1.000 0
449 >>> print '%.3f'%scores[1][0],scores[1][1]
450 0.802 1
451
452
453 """
454
455 __test__={'molTest':molTest}
456
458 import doctest,sys
459 return doctest.testmod(sys.modules["__main__"])
460
461
462 if __name__ == '__main__':
463 import sys
464 failed,tried = _test()
465 sys.exit(failed)
466