Package rdkit :: Package Chem :: Module MACCSkeys
[hide private]
[frames] | no frames]

Source Code for Module rdkit.Chem.MACCSkeys

  1  # $Id$ 
  2  # 
  3  # Copyright (C) 2001-2011 greg Landrum and Rational Discovery LLC 
  4  # 
  5  #   @@ All Rights Reserved @@ 
  6  #  This file is part of the RDKit. 
  7  #  The contents are covered by the terms of the BSD license 
  8  #  which is included in the file license.txt, found at the root 
  9  #  of the RDKit source tree. 
 10  # 
  11  """ SMARTS definitions for the publicly available MACCS keys 
 12  and a MACCS fingerprinter 
 13   
 14  I compared the MACCS fingerprints generated here with those from two 
 15  other packages (not MDL, unfortunately). Of course there are 
 16  disagreements between the various fingerprints still, but I think 
 17  these definitions work pretty well. Some notes: 
 18   
 19  1) most of the differences have to do with aromaticity 
 20  2) there's a discrepancy sometimes because the current RDKit 
 21  definitions do not require multiple matches to be distinct. e.g. the 
 22  SMILES C(=O)CC(=O) can match the (hypothetical) key O=CC twice in my 
 23  definition. It's not clear to me what the correct behavior is. 
 24  3) Some keys are not fully defined in the MDL documentation 
 25  4) Two keys, 125 and 166, have to be done outside of SMARTS. 
 26  5) Key 1 (ISOTOPE) isn't defined 
 27   
 28  Rev history: 
 29  2006 (gl): Original open-source release 
 30  May 2011 (gl): Update some definitions based on feedback from Andrew Dalke 
 31   
 32  """ 
 33  from __future__ import print_function 
 34  from rdkit import Chem 
 35  from rdkit.Chem import rdMolDescriptors 
 36  from rdkit import DataStructs 
# these are SMARTS patterns corresponding to the MDL MACCS keys
#
# Each entry maps a MACCS key number to a (SMARTS, count) tuple:
#   - count == 0: the key is set when the pattern matches at least once
#   - count == n > 0: the key is set only when there are more than n matches
#   - a SMARTS of '?' marks a key that is not evaluated from a pattern; the
#     Python implementation in this module computes keys 125 and 166 in code
#     and leaves key 1 unset
smartsPatts = {
  1: ('?', 0),  # ISOTOPE
  #2:('[#104,#105,#106,#107,#108,#109,#110,#111,#112]',0),  # atomic num >103 Not complete
  2: ('[#104]', 0),  # limit the above def'n since the RDKit only accepts up to #104
  3: ('[#32,#33,#34,#50,#51,#52,#82,#83,#84]', 0),  # Group IVa,Va,VIa Rows 4-6
  4: ('[Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr]', 0),  # actinide
  5: ('[Sc,Ti,Y,Zr,Hf]', 0),  # Group IIIB,IVB (Sc...)
  6: ('[La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu]', 0),  # Lanthanide
  7: ('[V,Cr,Mn,Nb,Mo,Tc,Ta,W,Re]', 0),  # Group VB,VIB,VIIB
  8: ('[!#6;!#1]1~*~*~*~1', 0),  # QAAA@1
  9: ('[Fe,Co,Ni,Ru,Rh,Pd,Os,Ir,Pt]', 0),  # Group VIII (Fe...)
  10: ('[Be,Mg,Ca,Sr,Ba,Ra]', 0),  # Group IIa (Alkaline earth)
  11: ('*1~*~*~*~1', 0),  # 4M Ring
  12: ('[Cu,Zn,Ag,Cd,Au,Hg]', 0),  # Group IB,IIB (Cu..)
  13: ('[#8]~[#7](~[#6])~[#6]', 0),  # ON(C)C
  14: ('[#16]-[#16]', 0),  # S-S
  15: ('[#8]~[#6](~[#8])~[#8]', 0),  # OC(O)O
  16: ('[!#6;!#1]1~*~*~1', 0),  # QAA@1
  17: ('[#6]#[#6]', 0),  #CTC
  18: ('[#5,#13,#31,#49,#81]', 0),  # Group IIIA (B...)
  19: ('*1~*~*~*~*~*~*~1', 0),  # 7M Ring
  20: ('[#14]', 0),  #Si
  21: ('[#6]=[#6](~[!#6;!#1])~[!#6;!#1]', 0),  # C=C(Q)Q
  22: ('*1~*~*~1', 0),  # 3M Ring
  23: ('[#7]~[#6](~[#8])~[#8]', 0),  # NC(O)O
  24: ('[#7]-[#8]', 0),  # N-O
  25: ('[#7]~[#6](~[#7])~[#7]', 0),  # NC(N)N
  26: ('[#6]=;@[#6](@*)@*', 0),  # C$=C($A)$A
  27: ('[I]', 0),  # I
  28: ('[!#6;!#1]~[CH2]~[!#6;!#1]', 0),  # QCH2Q
  29: ('[#15]', 0),  # P
  30: ('[#6]~[!#6;!#1](~[#6])(~[#6])~*', 0),  # CQ(C)(C)A
  31: ('[!#6;!#1]~[F,Cl,Br,I]', 0),  # QX
  32: ('[#6]~[#16]~[#7]', 0),  # CSN
  33: ('[#7]~[#16]', 0),  # NS
  34: ('[CH2]=*', 0),  # CH2=A
  35: ('[Li,Na,K,Rb,Cs,Fr]', 0),  # Group IA (Alkali Metal)
  36: ('[#16R]', 0),  # S Heterocycle
  37: ('[#7]~[#6](~[#8])~[#7]', 0),  # NC(O)N
  38: ('[#7]~[#6](~[#6])~[#7]', 0),  # NC(C)N
  39: ('[#8]~[#16](~[#8])~[#8]', 0),  # OS(O)O
  40: ('[#16]-[#8]', 0),  # S-O
  41: ('[#6]#[#7]', 0),  # CTN
  42: ('F', 0),  # F
  43: ('[!#6;!#1;!H0]~*~[!#6;!#1;!H0]', 0),  # QHAQH
  44: ('[!#1;!#6;!#7;!#8;!#9;!#14;!#15;!#16;!#17;!#35;!#53]', 0),  # OTHER
  45: ('[#6]=[#6]~[#7]', 0),  # C=CN
  46: ('Br', 0),  # BR
  47: ('[#16]~*~[#7]', 0),  # SAN
  48: ('[#8]~[!#6;!#1](~[#8])(~[#8])', 0),  # OQ(O)O
  49: ('[!+0]', 0),  # CHARGE
  50: ('[#6]=[#6](~[#6])~[#6]', 0),  # C=C(C)C
  51: ('[#6]~[#16]~[#8]', 0),  # CSO
  52: ('[#7]~[#7]', 0),  # NN
  53: ('[!#6;!#1;!H0]~*~*~*~[!#6;!#1;!H0]', 0),  # QHAAAQH
  54: ('[!#6;!#1;!H0]~*~*~[!#6;!#1;!H0]', 0),  # QHAAQH
  55: ('[#8]~[#16]~[#8]', 0),  #OSO
  56: ('[#8]~[#7](~[#8])~[#6]', 0),  # ON(O)C
  57: ('[#8R]', 0),  # O Heterocycle
  58: ('[!#6;!#1]~[#16]~[!#6;!#1]', 0),  # QSQ
  59: ('[#16]!:*:*', 0),  # Snot%A%A
  60: ('[#16]=[#8]', 0),  # S=O
  61: ('*~[#16](~*)~*', 0),  # AS(A)A
  62: ('*@*!@*@*', 0),  # A$!A$A
  63: ('[#7]=[#8]', 0),  # N=O
  64: ('*@*!@[#16]', 0),  # A$A!S
  65: ('c:n', 0),  # C%N
  66: ('[#6]~[#6](~[#6])(~[#6])~*', 0),  # CC(C)(C)A
  67: ('[!#6;!#1]~[#16]', 0),  # QS
  68: ('[!#6;!#1;!H0]~[!#6;!#1;!H0]', 0),  # QHQH (&...) SPEC Incomplete
  69: ('[!#6;!#1]~[!#6;!#1;!H0]', 0),  # QQH
  70: ('[!#6;!#1]~[#7]~[!#6;!#1]', 0),  # QNQ
  71: ('[#7]~[#8]', 0),  # NO
  72: ('[#8]~*~*~[#8]', 0),  # OAAO
  73: ('[#16]=*', 0),  # S=A
  74: ('[CH3]~*~[CH3]', 0),  # CH3ACH3
  75: ('*!@[#7]@*', 0),  # A!N$A
  76: ('[#6]=[#6](~*)~*', 0),  # C=C(A)A
  77: ('[#7]~*~[#7]', 0),  # NAN
  78: ('[#6]=[#7]', 0),  # C=N
  79: ('[#7]~*~*~[#7]', 0),  # NAAN
  80: ('[#7]~*~*~*~[#7]', 0),  # NAAAN
  81: ('[#16]~*(~*)~*', 0),  # SA(A)A
  82: ('*~[CH2]~[!#6;!#1;!H0]', 0),  # ACH2QH
  83: ('[!#6;!#1]1~*~*~*~*~1', 0),  # QAAAA@1
  84: ('[NH2]', 0),  #NH2
  85: ('[#6]~[#7](~[#6])~[#6]', 0),  # CN(C)C
  86: ('[C;H2,H3][!#6;!#1][C;H2,H3]', 0),  # CH2QCH2
  87: ('[F,Cl,Br,I]!@*@*', 0),  # X!A$A
  88: ('[#16]', 0),  # S
  89: ('[#8]~*~*~*~[#8]', 0),  # OAAAO
  90:
  ('[$([!#6;!#1;!H0]~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[CH2;R]1)]',
   0),  # QHAACH2A
  91:
  ('[$([!#6;!#1;!H0]~*~*~*~[CH2]~*),$([!#6;!#1;!H0;R]1@[R]@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~[R]1@[R]@[R]@[CH2;R]1),$([!#6;!#1;!H0]~*~[R]1@[R]@[CH2;R]1)]',
   0),  # QHAAACH2A
  92: ('[#8]~[#6](~[#7])~[#6]', 0),  # OC(N)C
  93: ('[!#6;!#1]~[CH3]', 0),  # QCH3
  94: ('[!#6;!#1]~[#7]', 0),  # QN
  95: ('[#7]~*~*~[#8]', 0),  # NAAO
  96: ('*1~*~*~*~*~1', 0),  # 5 M ring
  97: ('[#7]~*~*~*~[#8]', 0),  # NAAAO
  98: ('[!#6;!#1]1~*~*~*~*~*~1', 0),  # QAAAAA@1
  99: ('[#6]=[#6]', 0),  # C=C
  100: ('*~[CH2]~[#7]', 0),  # ACH2N
  101:
  ('[$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1),$([R]@1@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]@[R]1)]',
   0),  # 8M Ring or larger. This only handles up to ring sizes of 14
  102: ('[!#6;!#1]~[#8]', 0),  # QO
  103: ('Cl', 0),  # CL
  104: ('[!#6;!#1;!H0]~*~[CH2]~*', 0),  # QHACH2A
  105: ('*@*(@*)@*', 0),  # A$A($A)$A
  106: ('[!#6;!#1]~*(~[!#6;!#1])~[!#6;!#1]', 0),  # QA(Q)Q
  107: ('[F,Cl,Br,I]~*(~*)~*', 0),  # XA(A)A
  108: ('[CH3]~*~*~*~[CH2]~*', 0),  # CH3AAACH2A
  109: ('*~[CH2]~[#8]', 0),  # ACH2O
  110: ('[#7]~[#6]~[#8]', 0),  # NCO
  111: ('[#7]~*~[CH2]~*', 0),  # NACH2A
  112: ('*~*(~*)(~*)~*', 0),  # AA(A)(A)A
  113: ('[#8]!:*:*', 0),  # Onot%A%A
  114: ('[CH3]~[CH2]~*', 0),  # CH3CH2A
  115: ('[CH3]~*~[CH2]~*', 0),  # CH3ACH2A
  116: ('[$([CH3]~*~*~[CH2]~*),$([CH3]~*1~*~[CH2]1)]', 0),  # CH3AACH2A
  117: ('[#7]~*~[#8]', 0),  # NAO
  118: ('[$(*~[CH2]~[CH2]~*),$(*1~[CH2]~[CH2]1)]', 1),  # ACH2CH2A > 1
  119: ('[#7]=*', 0),  # N=A
  120: ('[!#6;R]', 1),  # Heterocyclic atom > 1 (&...) Spec Incomplete
  121: ('[#7;R]', 0),  # N Heterocycle
  122: ('*~[#7](~*)~*', 0),  # AN(A)A
  123: ('[#8]~[#6]~[#8]', 0),  # OCO
  124: ('[!#6;!#1]~[!#6;!#1]', 0),  # QQ
  125: ('?', 0),  # Aromatic Ring > 1
  126: ('*!@[#8]!@*', 0),  # A!O!A
  127: ('*@*!@[#8]', 1),  # A$A!O > 1 (&...) Spec Incomplete
  128:
  ('[$(*~[CH2]~*~*~*~[CH2]~*),$([R]1@[CH2;R]@[R]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[R]@[CH2;R]1),$(*~[CH2]~*~[R]1@[R]@[CH2;R]1)]',
   0),  # ACH2AAACH2A
  129: ('[$(*~[CH2]~*~*~[CH2]~*),$([R]1@[CH2]@[R]@[R]@[CH2;R]1),$(*~[CH2]~[R]1@[R]@[CH2;R]1)]',
        0),  # ACH2AACH2A
  130: ('[!#6;!#1]~[!#6;!#1]', 1),  # QQ > 1 (&...)  Spec Incomplete
  131: ('[!#6;!#1;!H0]', 1),  # QH > 1
  132: ('[#8]~*~[CH2]~*', 0),  # OACH2A
  133: ('*@*!@[#7]', 0),  # A$A!N
  134: ('[F,Cl,Br,I]', 0),  # X (HALOGEN)
  135: ('[#7]!:*:*', 0),  # Nnot%A%A
  136: ('[#8]=*', 1),  # O=A>1
  137: ('[!C;!c;R]', 0),  # Heterocycle
  138: ('[!#6;!#1]~[CH2]~*', 1),  # QCH2A>1 (&...) Spec Incomplete
  139: ('[O;!H0]', 0),  # OH
  140: ('[#8]', 3),  # O > 3 (&...) Spec Incomplete
  141: ('[CH3]', 2),  # CH3 > 2  (&...) Spec Incomplete
  142: ('[#7]', 1),  # N > 1
  143: ('*@*!@[#8]', 0),  # A$A!O
  144: ('*!:*:*!:*', 0),  # Anot%A%Anot%A
  145: ('*1~*~*~*~*~*~1', 1),  # 6M ring > 1
  146: ('[#8]', 2),  # O > 2
  147: ('[$(*~[CH2]~[CH2]~*),$([R]1@[CH2;R]@[CH2;R]1)]', 0),  # ACH2CH2A
  148: ('*~[!#6;!#1](~*)~*', 0),  # AQ(A)A
  149: ('[C;H3,H4]', 1),  # CH3 > 1
  150: ('*!@*@*!@*', 0),  # A!A$A!A
  151: ('[#7;!H0]', 0),  # NH
  152: ('[#8]~[#6](~[#6])~[#6]', 0),  # OC(C)C
  153: ('[!#6;!#1]~[CH2]~*', 0),  # QCH2A
  154: ('[#6]=[#8]', 0),  # C=O
  155: ('*!@[CH2]!@*', 0),  # A!CH2!A
  156: ('[#7]~*(~*)~*', 0),  # NA(A)A
  157: ('[#6]-[#8]', 0),  # C-O
  158: ('[#6]-[#7]', 0),  # C-N
  159: ('[#8]', 1),  # O>1
  160: ('[C;H3,H4]', 0),  #CH3
  161: ('[#7]', 0),  # N
  162: ('a', 0),  # Aromatic
  163: ('*1~*~*~*~*~*~1', 0),  # 6M Ring
  164: ('[#8]', 0),  # O
  165: ('[R]', 0),  # Ring
  166: ('?', 0),  # Fragments  FIX: this can't be done in SMARTS
}

# cache of (compiled SMARTS query, count) tuples, one per key; it stays None
# until the first call to _pyGenMACCSKeys fills it in via _InitKeys
maccsKeys = None
218   
219   
220 -def _InitKeys(keyList, keyDict):
221 """ *Internal Use Only* 222 223 generates SMARTS patterns for the keys, run once 224 225 """ 226 assert len(keyList) == len(keyDict.keys()), 'length mismatch' 227 for key in keyDict.keys(): 228 patt, count = keyDict[key] 229 if patt != '?': 230 sma = Chem.MolFromSmarts(patt) 231 if not sma: 232 print('SMARTS parser error for key #%d: %s' % (key, patt)) 233 else: 234 keyList[key - 1] = sma, count
235 236
def _pyGenMACCSKeys(mol, **kwargs):
  """ pure Python generation of the MACCS fingerprint for a molecule

  **Arguments**

    - mol: the molecule to be fingerprinted

    - ctor: (optional keyword) callable used to build the result; it is
      called with the number of bits and defaults to
      DataStructs.SparseBitVect

    - any other keyword arguments are ignored

  **Returns**

    the object created by ctor (by default a _DataStructs.SparseBitVect_)
    containing the fingerprint. Bit N corresponds to MACCS key N; bit 0 is
    not written by this function.

  >>> m = Chem.MolFromSmiles('CNO')
  >>> bv = GenMACCSKeys(m)
  >>> tuple(bv.GetOnBits())
  (24, 68, 69, 71, 93, 94, 102, 124, 131, 139, 151, 158, 160, 161, 164)
  >>> bv = GenMACCSKeys(Chem.MolFromSmiles('CCC'))
  >>> tuple(bv.GetOnBits())
  (74, 114, 149, 155, 160)

  """
  global maccsKeys
  if maccsKeys is None:
    # first call: compile every SMARTS once and keep the result at module level
    maccsKeys = [(None, 0)] * len(smartsPatts)
    _InitKeys(maccsKeys, smartsPatts)
  makeVect = kwargs.get('ctor', DataStructs.SparseBitVect)

  fp = makeVect(len(maccsKeys) + 1)
  # the key stored at list index i is MACCS key number i + 1
  for bitId, (query, minCount) in enumerate(maccsKeys, 1):
    if query is not None:
      if minCount == 0:
        # plain presence test
        fp[bitId] = mol.HasSubstructMatch(query)
      elif len(mol.GetSubstructMatches(query)) > minCount:
        # the key requires more than minCount matches
        fp[bitId] = 1
      continue

    # keys without a compiled query: only 125 and 166 are handled in code
    if bitId == 125:
      # special case: num aromatic rings > 1
      aromaticRings = 0
      fp[125] = 0
      for ringBonds in mol.GetRingInfo().BondRings():
        # a ring counts as aromatic only when every one of its bonds is aromatic
        if all(mol.GetBondWithIdx(idx).GetIsAromatic() for idx in ringBonds):
          aromaticRings += 1
          if aromaticRings > 1:
            fp[125] = 1
            break
    elif bitId == 166:
      # special case: num frags > 1
      fp[166] = 0
      if len(Chem.GetMolFrags(mol)) > 1:
        fp[166] = 1

  return fp


# Both public entry points are bound to the same function from
# rdMolDescriptors; neither name is bound to _pyGenMACCSKeys.
GenMACCSKeys = FingerprintMol = rdMolDescriptors.GetMACCSKeysFingerprint


#------------------------------------
#
#  doctest boilerplate
#
307 -def _test():
308 import doctest, sys 309 return doctest.testmod(sys.modules["__main__"])


if __name__ == '__main__':
  import sys
  # _test() returns a (failed, attempted) pair; the number of failures is
  # used as the process exit status, so 0 means every doctest passed
  outcome = _test()
  sys.exit(outcome[0])