Package rdkit :: Package ML :: Package InfoTheory :: Module BitClusterer
[hide private]
[frames | no frames]

Source Code for Module rdkit.ML.InfoTheory.BitClusterer

 1  # 
 2  #  Copyright (C) 2000-2008  Greg Landrum and Rational Discovery LLC 
 3  # 
 4   
 5  from rdkit.SimDivFilters import rdSimDivPickers as rdsimdiv 
 6  from rdkit import DataStructs 
 7  import numpy 
 8   
class BitClusterer(object):
    """Group fingerprint bits into clusters according to their correlation.

    The bit correlation matrix is not computed by this class; the caller
    passes it to ``ClusterBits``. Once clusters are available (either computed
    by ``ClusterBits`` or supplied through ``SetClusters``), a fingerprint can
    be projected onto them with ``MapToClusterScores`` or ``MapToClusterFP``.
    """

    def __init__(self, idList, nCluster, type=rdsimdiv.ClusterMethod.WARD):
        """Store the clustering configuration; no clustering is done here.

        ARGUMENTS:
          - idList : ids of the bits to be clustered
          - nCluster : number of clusters to produce
          - type : clustering method handed to the hierarchical cluster picker
        """
        self._bidList = idList
        self._nClusters = nCluster
        self._type = type
        self._clusters = []

    def ClusterBits(self, corrMat):
        """Cluster the bits using a correlation matrix and store the result.

        ARGUMENTS:
          - corrMat : correlation values between the bits; it must support
            ``1 / corrMat`` (presumably a numpy array in the layout the
            picker expects -- TODO confirm against callers)
        """
        # The picker works on distances, so use the reciprocal of every
        # correlation value.
        # NOTE(review): a zero correlation value divides by zero here --
        # confirm that callers never pass one.
        distances = 1 / corrMat
        picker = rdsimdiv.HierarchicalClusterPicker(self._type)
        picked = picker.Cluster(distances, len(self._bidList), self._nClusters)

        # The picker reports positions within the id list; translate them
        # back into the actual bit ids.
        self._clusters = []
        for members in picked:
            self._clusters.append([self._bidList[pos] for pos in members])

    def SetClusters(self, clusters):
        """Replace the stored clusters with an externally supplied list.

        The number of clusters must equal the cluster count given at
        construction time; this is checked with an assertion.
        """
        assert len(clusters) == self._nClusters
        self._clusters = clusters

    def GetClusters(self):
        """Return the currently stored clusters (lists of bit ids)."""
        return self._clusters

    def MapToClusterScores(self, fp):
        """Map a fingerprint to a vector of per-cluster on-bit counts.

        The returned list has one entry per cluster; each entry is the number
        of bits in that cluster that are turned on in the fingerprint.

        ARGUMENTS:
          - fp : the fingerprint, indexable by bit id
        """
        scores = [0] * self._nClusters
        for idx, cluster in enumerate(self._clusters):
            for bid in cluster:
                if fp[bid]:
                    scores[idx] += 1
        return scores

    def MapToClusterFP(self, fp):
        """Map a fingerprint to a smaller one with a single bit per cluster.

        Bit ``i`` of the returned bit vector is turned on if any bit belonging
        to cluster ``i`` is turned on in the original fingerprint.

        ARGUMENTS:
          - fp : the fingerprint, indexable by bit id
        """
        ebv = DataStructs.ExplicitBitVect(self._nClusters)
        for idx, cluster in enumerate(self._clusters):
            # any() stops at the first on-bit, just like an explicit break.
            if any(fp[bid] for bid in cluster):
                ebv.SetBit(idx)
        return ebv
86