Package Chem :: Module MACCSkeys
[hide private]
[frames | no frames]

Source Code for Module Chem.MACCSkeys

  1  # $Id: MACCSkeys.py 2 2006-05-06 22:54:39Z glandrum $ 
  2  # 
  3  # Copyright (C) 2001-2006 greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved  @@ 
  6  # 
  7  """ SMARTS definitions for the publicly available MACCS keys 
  8  and a MACCS fingerprinter 
  9   
 10  """ 
 11  import Chem 
 12  import DataStructs 
 13  # these are SMARTS patterns corresponding to the MDL MACCS keys 
 14  smartsPatts={ 
 15    1:('?',0), # ISOTOPE 
 16    #2:('[#103,#104,#105,#106,#107,#106,#109,#110,#111,#112]',0),  # ISOTOPE Not complete 
 17    2:('[#103,#104]',0),  # ISOTOPE Not complete 
 18    3:('[Ge,As,Se,Sn,Sb,Te,Tl,Pb,Bi]',0), # Group IVa,Va,VIa Periods 4-6 (Ge...)  *NOTE* spec wrong 
 19    4:('[Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr]',0), # actinide 
 20    5:('[Sc,Ti,Y,Zr,Hf]',0), # Group IIIB,IVB (Sc...)  *NOTE* spec wrong 
 21    6:('[La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu]',0), # Lanthanide 
 22    7:('[V,Cr,Mn,Nb,Mo,Tc,Ta,W,Re]',0), # Group VB,VIB,VIIB (V...) *NOTE* spec wrong 
 23    8:('[!C;!c;!#1]1~*~*~*~1',0), # QAAA@1 
 24    9:('[Fe,Co,Ni,Ru,Rh,Pd,Os,Ir,Pt]',0), # Group VIII (Fe...) 
 25    10:('[Be,Mg,Ca,Sr,Ba,Ra]',0), # Group IIa (Alkaline earth) 
 26    11:('*1~*~*~*~1',0), # 4M Ring 
 27    12:('[Cu,Zn,Ag,Cd,Au,Hg]',0), # Group IB,IIB (Cu..) 
 28    13:('[O,o]~[N,n](~[C,c])~[C,c]',0), # ON(C)C 
 29    14:('[S,s]-[S,s]',0), # S-S 
 30    15:('[O,o]~[C,c](~[O,o])~[O,o]',0), # OC(O)O 
 31    16:('[!C;!c;!#1]1~*~*~1',0), # QAA@1 
 32    17:('[C,c]#[C,c]',0), #CTC 
 33    18:('[B,Al,Ga,In,Tl]',0), # Group IIIA (B...) *NOTE* spec wrong 
 34    19:('*1~*~*~*~*~*~*~1',0), # 7M Ring 
 35    20:('[Si]',0), #Si 
 36    21:('[C,c]=[C,c](~[!C;!c;!#1])~[!C;!c;!#1]',0), # C=C(Q)Q 
 37    22:('*1~*~*~1',0), # 3M Ring 
 38    23:('[N,n]~[C,c](~[O,o])~[O,o]',0), # NC(O)O 
 39    24:('[N,n]-[O,o]',0), # N-O 
 40    25:('[N,n]~[C,c](~[N,n])~[N,n]',0), # NC(N)N 
 41    26:('[C,c]=;@[C,c](@*)@*',0), # C$=C($A)$A 
 42    27:('[I]',0), # I 
 43    28:('[!C;!c;!#1]~[CH2]~[!C;!c;!#1]',0), # QCH2Q 
 44    29:('P',0),# P 
 45    30:('[C,c]~[!C;!c;!#1](~[C,c])(~[C,c])~*',0), # CQ(C)(C)A 
 46    31:('[!C;!c;!#1]~[F,Cl,Br,I]',0), # QX 
 47    32:('[C,c]~[S,s]~[N,n]',0), # CSN 
 48    33:('[N,n]~[S,s]',0), # NS 
 49    34:('[CH2]=*',0), # CH2=A 
 50    35:('[Li,Na,K,Rb,Cs,Fr]',0), # Group IA (Alkali Metal) 
 51    36:('[$(S@*),$(s@*)]',0), # S Heterocycle 
 52    37:('[N,n]~[C,c](~[O,o])~[N,n]',0), # NC(O)N 
 53    38:('[N,n]~[C,c](~[C,c])~[N,n]',0), # NC(C)N 
 54    39:('[O,o]~[S,s](~[O,o])~[O,o]',0), # OS(O)O 
 55    40:('[S,s]-[O,o]',0), # S-O 
 56    41:('[C,c]#[N,n]',0), # CTN 
 57    42:('F',0), # F 
 58    43:('[!C;!c;!#1;H,H2,H3,H4]~*~[!C;!c;!#1;H,H2,H3,H4]',0), # QHAQH FIX: possibly incomplete 
 59    44:('?',0), # OTHER 
 60    45:('[C,c]=[C,c]~[N,n]',0), # C=CN 
 61    46:('Br',0), # BR 
 62    47:('[S,s]~*~[N,n]',0), # SAN 
 63    48:('[O,o]~[!C;!c;!#1](~[O,o])(~[O,o])~*',0), # OQ(O)O 
 64    49:('[-,-2,-3,-4,+,+2,+3,+4]',0), # CHARGE  FIX: possibly incomplete 
 65    50:('[C,c]=[C,c](~[C,c])~[C,c]',0), # C=C(C)C 
 66    51:('[C,c]~[S,s]~[O,o]',0), # CSO 
 67    52:('[N,n]~[N,n]',0), # NN 
 68    53:('[!C;!c;!#1;H,H2,H3,H4]~*~*~*~[!C;!c;!#1;H,H2,H3,H4]',0), # QHAAAQH FIX: possibly incomplete 
 69    54:('[!C;!c;!#1;H,H2,H3,H4]~*~*~[!C;!c;!#1;H,H2,H3,H4]',0), # QHAAQH FIX: possibly incomplete 
 70    55:('[O,o]~[S,s]~[O,o]',0), #OSO 
 71    56:('[O,o]~[N,n](~[O,o])~[C,c]',0), # ON(O)C 
 72    57:('[$(O@*),$(o@*)]',0), # O Heterocycle 
 73    58:('[!C;!c;!#1]~[S,s]~[!C;!c;!#1]',0), # QSQ 
 74    59:('[S,s]!:*:*',0), # Snot%A%A 
 75    60:('[S,s]=[O,o]',0), # S=O 
 76    61:('*~[S,s](~*)~*',0), # AS(A)A 
 77    62:('*@*!@*@*',0), # A$!A$A 
 78    63:('[N,n]=[O,o]',0), # N=O 
 79    64:('*@*!@[S,s]',0), # A$A!S 
 80    65:('[C,c]:[N,n]',0), # C%N 
 81    66:('[C,c]~[C,c](~[C,c])(~[C,c])~*',0), # CC(C)(C)A 
 82    67:('[!C;!c;!#1]~[S,s]',0), # QS 
 83    68:('[!C;!c;!#1;H,H2,H3,H4]~[!C;!c;!#1;H,H2,H3,H4]',0), # QHQH FIX: possibly incomplete 
 84    69:('[!C;!c;!#1]~[!C;!c;!#1;H,H2,H3,H4]',0), # QQH FIX: possibly incomplete 
 85    70:('[!C;!c;!#1]~[N,n]~[!C;!c;!#1]',0), # QNQ 
 86    71:('[N,n]~[O,o]',0), # NO 
 87    72:('[O,o]~*~*~[O,o]',0), # OAAO 
 88    73:('[S,s]=*',0), # S=A 
 89    74:('[CH3]~*~[CH3]',0), # CH3ACH3 
 90    75:('*!@[N,n]@*',0), # A!N$A 
 91    76:('[C,c]=[C,c](~*)~*',0), # C=C(A)A 
 92    77:('[N,n]~*~[N,n]',0), # NAN 
 93    78:('[C,c]=[N,n]',0), # C=N 
 94    79:('[N,n]~*~*~[N,n]',0), # NAAN 
 95    80:('[N,n]~*~*~*~[N,n]',0), # NAAAN 
 96    81:('[S,s]~*(~*)~*',0), # SA(A)A 
 97    82:('*~[CH2]~[!C;!c;!#1;H,H2,H3,H4]',0), # ACH2QH 
 98    83:('[!C;!c;!#1]1~*~*~*~*~1',0), # QAAAA@1 
 99    84:('[NH2]',0), #NH2 
100    85:('[C,c]~[N,n](~[C,c])~[C,c]',0), # CN(C)C 
101    86:('[CH2][!C;!c;!#1][CH2]',0), # CH2QCH2 
102    87:('[F,Cl,Br,I]!@*@*',0), # X!A$A 
103    88:('[S,s]',0), # S 
104    89:('[O,o]~*~*~*~[O,o]',0), # OAAAO 
105    90:('[!C;!c;!#1;H,H2,H3,H4]~*~*~[CH2]~*',0), # QHAACH2A 
106    91:('[!C;!c;!#1;H,H2,H3,H4]~*~*~*~[CH2]~*',0), # QHAAACH2A 
107    92:('[O,o]~[C,c](~[N,n])~[C,c]',0), # OC(N)C 
108    93:('[!C;!c;!#1]~[CH3]',0), # QCH3 
109    94:('[!C;!c;!#1]~[N,n]',0), # QN 
110    95:('[N,n]~*~*~[O,o]',0), # NAAO 
111    96:('*1~*~*~*~*~1',0), # 5 M ring 
112    97:('[N,n]~*~*~*~[O,o]',0), # NAAAO 
113    98:('[!C;!c;!#1]1~*~*~*~*~*~1',0), # QAAAAA@1 
114    99:('[C,c]=[C,c]',0), # C=C 
115    100:('*~[CH2]~[N,n]',0), # ACH2N 
116    101:('[r8,r9,r10,r11,r12]',0), # 8M Ring or larger FIX: This is not exhaustive and it appears that oelib doesn't do this right 
117    102:('[!C;!c;!#1]~[O,o]',0), # QO 
118    103:('Cl',0), # CL 
119    104:('[!C;!c;!#1;H,H2,H3,H4]~*~[CH2]~*',0), # QHACH2A 
120    105:('[!C;!c;!#1]@*(@*)@*',0), # A$A($A)$A 
121    106:('[!C;!c;!#1]~*(~[!C;!c;!#1])~[!C;!c;!#1]',0), # QA(Q)Q 
122    107:('[F,Cl,Br,I]~*(~*)~*',0), # XA(A)A 
123    108:('[CH3]~*~*~*~[CH2]~*',0), # CH3AAACH2A 
124    109:('*~[CH2]~[O,o]',0), # ACH2O 
125    110:('[N,n]~[C,c]~[O,o]',0), # NCO 
126    111:('[N,n]~*~[CH2]~*',0), # NACH2A 
127    112:('*~*(~*)(~*)~*',0), # AA(A)(A)A 
128    113:('[O,o]!:*:*',0), # Onot%A%A 
129    114:('[CH3]~[CH2]~*',0), # CH3CH2A 
130    115:('[CH3]~*~[CH2]~*',0), # CH3ACH2A 
131    116:('[CH3]~*~*~[CH2]~*',0), # CH3AACH2A 
132    117:('[N,n]~*~[O,o]',0), # NAO 
133    118:('*~[CH2]~[CH2]~*',1), # ACH2CH2A > 1 
134    119:('[N,n]=*',0), # N=A 
135    120:('[!C;!c;R]',1), # Heterocyclic atom > 1 
136    121:('[$(N@*),$(n@*)]',0), # N Heterocycle 
137    122:('*~[N,n](~*)~*',0), # AN(A)A 
138    123:('[O,o]~[C,c]~[O,o]',0), # OCO 
139    124:('[!C;!c;!#1]~[!C;!c;!#1]',0), # QQ 
140    125:('?',0), # Aromatic Ring > 1 
141    126:('*!@[O,o]!@*',0), # A!O!A 
142    127:('*@*!@[O,o]',1), # A$A!O > 1 
143    128:('*~[CH2]~*~*~*~[CH2]~*',0), # ACH2AAACH2A 
144    129:('*~[CH2]~*~*~[CH2]~*',0), # ACH2AACH2A 
145    130:('[!C;!c;!#1]~[!C;!c;!#1]',1), # QQ > 1 (&...) 
146    131:('[!C;!c;!#1;H,H2,H3,H4]',1), # QH > 1 
147    132:('[O,o]~*~[CH2]~*',0), # OACH2A 
148    133:('*@*!:[N,n]',0), # A$A!N 
149    134:('[F,Cl,Br,I]',0), # X (HALOGEN) 
150    135:('[N,n]!:*:*',0), # Nnot%A%A 
151    136:('[O,o]=*',1), # O=A>1 FIX: maybe not right key 
152    137:('[!C;!c;R]',0), # Heterocycle 
153    138:('[!C;!c;!#1]~[CH2]~*',1), # QCH2A>1 (&...) 
154    139:('[OH,OH2,OH3]',0), # OH 
155    140:('[O,o]',3), # O > 3 
156    141:('[CH3]',2), # CH3 > 2 
157    142:('[N,n]',1), # N > 1 
158    143:('*@*!@[O,o]',0), # A$A!O 
159    144:('*!:*:*!:*',0), # Anot%A%Anot%A 
160    145:('*1~*~*~*~*~*~1',1), # 6M ring > 1 
161    146:('[O,o]',2), # O > 2 
162    147:('*~[CH2]~[CH2]~*',0), # ACH2CH2A 
163    148:('*~[!C;!c;!#1](~*)~*',0), # AQ(A)A 
164    149:('[CH3]',1), # CH3 > 1 
165    150:('*!@*@*!@*',0), # A!A$A!A 
166    151:('[NH,NH2,NH3,NH4]',0), # NH 
167    152:('[O,o]~[C,c](~[C,c])~[C,c]',0), # OC(C)C 
168    153:('[!C;!c;!#1]~[CH2]~*',0), # QCH2A 
169    154:('[C,c]=[O,o]',0), # C=O 
170    155:('*!@[CH2]!@*',0), # A!CH2!A 
171    156:('[N,n]~*(~*)~*',0), # NA(A)A 
172    157:('[C,c]-[O,o]',0), # C-O 
173    158:('[C,c]-[N,n]',0), # C-N 
174    159:('[O,o]',1), # O>1 
175    160:('[CH3]',0), #CH3 
176    161:('[N,n]',0), # N 
177    162:('a',0), # Aromatic 
178    163:('*1~*~*~*~*~*~1',0), # 6M Ring 
179    164:('[O,o]',0), # O 
180    165:('[R]',0), # Ring 
181    166:('?',0), # Fragments  FIX: this should be (*).(*), but that doesn't work properly in oelib 
182    } 
183   
184  maccsKeys = None 
185   
def _InitKeys(keyList,keyDict):
  """ *Internal Use Only*

   generates SMARTS patterns for the keys, run once

   **Arguments**

     - keyList: a list with one slot per key; it is modified in place,
       slot `key-1` receiving the tuple `(compiled pattern, count)`
       for every key whose SMARTS could be compiled

     - keyDict: dictionary mapping key number -> `(SMARTS, count)`;
       entries whose SMARTS is '?' are skipped

   **Notes**

     - keyList and keyDict must have the same length (checked with an
       assertion)

     - a SMARTS that fails to compile is reported on stdout and its
       slot in keyList is left untouched

  """
  assert len(keyList) == len(keyDict),'length mismatch'
  for key,(patt,count) in keyDict.items():
    if patt == '?':
      # placeholder: no pattern is defined for this key
      continue
    try:
      sma = Chem.MolFromSmarts(patt)
    except Exception:
      # best effort: a pattern that cannot be compiled is reported below
      # instead of aborting the whole initialization.  Exception (rather
      # than a bare except) lets KeyboardInterrupt/SystemExit through.
      sma = None
    if not sma:
      # single parenthesized argument: prints the same text under
      # both Python 2 and Python 3
      print('SMARTS parser error for key #%d: %s'%(key,patt))
    else:
      keyList[key-1] = sma,count
204
def GenMACCSKeys(mol,**kwargs):
  """ generates the MACCS fingerprint for a molecule

   **Arguments**

     - mol: the molecule to be fingerprinted

     - any extra keyword arguments are ignored

   **Returns**

      a _DataStructs.SparseBitVect_ containing the fingerprint.

   **Notes**

     - bit N of the result corresponds to MACCS key N; bit 0 is never
       set, which is why the vector has one more bit than there are keys

     - keys without a compiled pattern are skipped, so their bits are
       never set

  >>> m = Chem.MolFromSmiles('CNO')
  >>> bv = GenMACCSKeys(m)
  >>> tuple(bv.GetOnBits())
  (24, 68, 69, 71, 93, 94, 102, 124, 131, 139, 151, 158, 160, 161, 164)
  >>> bv = GenMACCSKeys(Chem.MolFromSmiles('CCC'))
  >>> tuple(bv.GetOnBits())
  (74, 114, 149, 155, 160)

  """
  global maccsKeys
  if maccsKeys is None:
    # build the pattern list locally and publish it only once it is
    # fully populated: if initialization is interrupted, the next call
    # starts over instead of silently using a half-filled cache
    keys = [(None,0)]*len(smartsPatts)
    _InitKeys(keys,smartsPatts)
    maccsKeys = keys

  res = DataStructs.SparseBitVect(len(maccsKeys)+1)
  for i,(patt,count) in enumerate(maccsKeys):
    if patt is None:
      continue
    matches = mol.GetSubstructMatches(patt)
    # count == 0 means "at least one match"; otherwise the key requires
    # strictly more than `count` matches
    if len(matches) > count:
      res[i+1] = 1
  return res

# alternative name for GenMACCSKeys
FingerprintMol = GenMACCSKeys

#------------------------------------
#
#  doctest boilerplate
#
def _test():
  """ runs the doctests found in the module executing as `__main__`

   **Returns**

      whatever `doctest.testmod()` returns

  """
  import doctest
  import sys
  mainModule = sys.modules["__main__"]
  return doctest.testmod(mainModule)

if __name__ == '__main__':
  import sys
  # use the number of failed doctests as the process exit status
  nFailed,nTried = _test()
  sys.exit(nFailed)