Package Chem :: Package Fingerprints :: Module ClusterMols
[hide private]
[frames | no frames]

Source Code for Module Chem.Fingerprints.ClusterMols

  1  # $Id: ClusterMols.py 2 2006-05-06 22:54:39Z glandrum $
 
  2  #
 
  3  #  Copyright (c) 2003-2006 Rational Discovery LLC
 
  4  #
 
  5  #   @@ All Rights Reserved  @@
 
  6  #
 
  7  """ utility functionality for clustering molecules using fingerprints
 
  8   includes a command line app for clustering
 
  9  
 
 10  
 
 11  Sample Usage:
 
 12    python ClusterMols.py  -d data.gdb -t daylight_sig \
 
 13      --idName="CAS_TF" -o clust1.pkl \
 
 14      --actTable="dop_test" --actName="moa_quant"
 
 15  
 
 16  """ 
 17  from Dbase.DbConnection import DbConnect 
 18  from Dbase import DbInfo,DbUtils 
 19  from ML.Data import DataUtils 
 20  from ML.Cluster import Clusters 
 21  from ML.Cluster import Murtagh 
 22  import sys,cPickle 
 23  from Chem.Fingerprints import FingerprintMols,MolSimilarity 
 24  import DataStructs 
 25  from Numeric import * 
 26  _cvsVersion="$Id: ClusterMols.py 2 2006-05-06 22:54:39Z glandrum $" 
 27  idx1 = _cvsVersion.find(':')+1 
 28  idx2 = _cvsVersion.rfind('$') 
 29  __VERSION_STRING="%s"%(_cvsVersion[idx1:idx2]) 
 30  
 
# Module-level aliases for the message/error callables exposed by
# FingerprintMols; the functions below use them for progress and error output.
message=FingerprintMols.message
error=FingerprintMols.error
 33  
 
def GetDistanceMatrix(data, metric, isSimilarity=1):
    """ Builds the condensed pairwise distance matrix for a set of fingerprints

    **Arguments**

      - data: a list of tuples with fingerprints in position 1
        (the rest of the elements of the tuple are not important)

      - metric: a callable taking two fingerprints and returning a number

      - isSimilarity: if true (the default), the values returned by _metric_
        are treated as similarities and converted to distances using
        1 - similarity; otherwise they are stored unchanged.

    **Returns**

      the symmetric distance matrix as a one-dimensional array with
      nPts*(nPts-1)/2 entries, filled in the order
      (1,0), (2,0), (2,1), (3,0), ...
      (see ML.Cluster.Resemblance for layout documentation)

    **Notes**

      - if the two fingerprints of a pair have different lengths, the longer
        one is folded down by the integer ratio of the two lengths before
        the metric is evaluated.

    """
    nPts = len(data)
    # floor division: the array size has to stay an integer even when true
    # division is in effect
    res = zeros(nPts * (nPts - 1) // 2, Float)
    nSoFar = 0
    for col in range(1, nPts):
        for row in range(col):
            fp1 = data[col][1]
            fp2 = data[row][1]
            nBits1 = fp1.GetNumBits()
            nBits2 = fp2.GetNumBits()
            # bring both fingerprints to a common length before comparing
            if nBits1 > nBits2:
                fp1 = DataStructs.FoldFingerprint(fp1, nBits1 // nBits2)
            elif nBits2 > nBits1:
                fp2 = DataStructs.FoldFingerprint(fp2, nBits2 // nBits1)
            sim = metric(fp1, fp2)
            if isSimilarity:
                sim = 1. - sim
            res[nSoFar] = sim
            nSoFar += 1
    return res
59
def ClusterPoints(data, metric, algorithmId, haveLabels=False, haveActs=True,
                  returnDistances=False):
    """ Clusters a set of fingerprints and labels the resulting tree

    **Arguments**

      - data: a sequence of tuples; element 0 is used to label the point,
        element 1 is the fingerprint and, when present, element 2 is the
        activity (must be convertible to an integer)

      - metric: the similarity metric, passed on to GetDistanceMatrix()

      - algorithmId: the clustering algorithm, passed on to
        Murtagh.ClusterData()

      - haveLabels: if true, element 0 of each tuple is used directly as the
        point label; otherwise the label is 'Mol: <element 0>'

      - haveActs: if true and the first tuple has more than two elements,
        element 2 of every tuple is attached to the points as activity

      - returnDistances: if true, the distance matrix is returned as well

    **Returns**

      the cluster tree, or the 2-tuple (cluster tree, distance matrix) if
      _returnDistances_ is set

    """
    message('Generating distance matrix.\n')
    dMat = GetDistanceMatrix(data, metric)
    message('Clustering\n')
    clustTree = Murtagh.ClusterData(dMat, len(data), algorithmId,
                                    isDistData=1)[0]
    acts = []
    if haveActs and len(data[0]) > 2:
        # we've got activities... use them:
        acts = [int(x[2]) for x in data]

    if not haveLabels:
        labels = ['Mol: %s' % str(x[0]) for x in data]
    else:
        labels = [x[0] for x in data]
    clustTree._ptLabels = labels
    if acts:
        clustTree._ptValues = acts
    for pt in clustTree.GetPoints():
        # NOTE(review): point indices look 1-based here - confirm against
        # ML.Cluster.Clusters
        idx = pt.GetIndex() - 1
        pt.SetName(labels[idx])
        if acts:
            # attaching the activity is best effort: a failure leaves the
            # point without data.  Only ordinary errors are ignored so that
            # KeyboardInterrupt/SystemExit are not swallowed.
            try:
                pt.SetData(int(acts[idx]))
            except Exception:
                pass
    if not returnDistances:
        return clustTree
    else:
        return clustTree, dMat
90
def ClusterFromDetails(details):
    """ Reads fingerprints as described by _details_ and clusters them

    **Arguments**

      - details: the run-time options.  The attributes read here are
        _maxMols_ (if > 0, only that many fingerprints are used),
        _outFileName_ (if set, the cluster tree is pickled to that file),
        _metric_ and _clusterAlgo_ (both passed on to ClusterPoints()).
        The object is also handed to MolSimilarity.GetFingerprints().

    **Returns**

      the cluster tree, or None if the output file could not be opened or
      no fingerprints were found

    """
    data = MolSimilarity.GetFingerprints(details)
    if details.maxMols > 0:
        data = data[:details.maxMols]

    # the output file is opened before clustering so that a bad file name
    # is reported before the expensive part of the calculation starts
    outF = None
    if details.outFileName:
        try:
            outF = open(details.outFileName, 'wb+')
        except IOError:
            error("Error: could not open output file %s for writing\n"%(details.outFileName))
            return None

    try:
        if not data:
            return None

        clustTree = ClusterPoints(data, details.metric, details.clusterAlgo,
                                  haveLabels=0, haveActs=1)
        if outF is not None:
            cPickle.dump(clustTree, outF)
        return clustTree
    finally:
        # make sure the file handle is released on every exit path
        if outF is not None:
            outF.close()

# Usage text for the command line app.  It is installed on FingerprintMols
# before the arguments are parsed (see the __main__ section below);
# presumably FingerprintMols displays it on request - verify against
# FingerprintMols.ParseArgs.
_usageDoc="""
Usage: ClusterMols.py [args] <fName>

  If <fName> is provided and no tableName is specified (see below),
  data will be read from the text file <fName>. Text files delimited
  with either commas (extension .csv) or tabs (extension .txt) are
  supported.

  Command line arguments are:

    - -d _dbName_: set the name of the database from which
      to pull input fingerprint information.

    - -t _tableName_: set the name of the database table
      from which to pull input fingerprint information

    - --idName=val: sets the name of the id column in the input
      database. Default is *ID*.

    - -o _outFileName_: name of the output file (output will
      be a pickle (.pkl) file with the cluster tree)

    - --actTable=val: name of table containing activity values
      (used to color points in the cluster tree).

    - --actName=val: name of column with activities in the activity
      table. The values in this column should either be integers or
      convertible into integers.

    - --SLINK: use the single-linkage clustering algorithm
      (default is Ward's minimum variance)

    - --CLINK: use the complete-linkage clustering algorithm
      (default is Ward's minimum variance)

    - --UPGMA: use the group-average clustering algorithm
      (default is Ward's minimum variance)

    - --dice: use the DICE similarity metric instead of Tanimoto

    - --cosine: use the cosine similarity metric instead of Tanimoto

    - --fpColName=val: name to use for the column which stores
      fingerprints (in pickled format) in the input db table.
      Default is *AutoFragmentFP*

    - --minPath=val: minimum path length to be included in
      fragment-based fingerprints. Default is *2*.

    - --maxPath=val: maximum path length to be included in
      fragment-based fingerprints. Default is *7*.

    - --nBitsPerHash: number of bits to be set in the output
      fingerprint for each fragment. Default is *4*.

    - --discrim: use of path-based discriminators to hash bits.
      Default is *false*.

    - -V: include valence information in the fingerprints
      Default is *false*.

    - -H: include Hs in the fingerprint
      Default is *false*.

    - --useMACCS: use the public MACCS keys to do the fingerprinting
      (instead of a daylight-type fingerprint)


"""
# Command line entry point: announce the version, install this module's
# usage text on FingerprintMols, parse the arguments and run the clustering.
if __name__ == '__main__':
    message("This is ClusterMols version %s\n\n"%(__VERSION_STRING))
    FingerprintMols._usageDoc=_usageDoc
    details = FingerprintMols.ParseArgs()
    ClusterFromDetails(details)
