1
2
3
4
5
6
""" SMARTS definitions for the publicly available MACCS keys
and a MACCS fingerprinter

"""
11 import Chem
12 import DataStructs
13
# Maps the 1-based MACCS key number to a (SMARTS, count) tuple:
#   - SMARTS: the substructure query for the key; '?' marks a key that has
#     no pattern here (the initializer skips those keys, so their bit is
#     never set by the fingerprinter)
#   - count:  match-count threshold; 0 means a single match sets the bit,
#     otherwise the bit is set only when there are MORE than `count` matches
smartsPatts = {
    1: ('?', 0),

    2: ('[#103,#104]', 0),
    3: ('[Ge,As,Se,Sn,Sb,Te,Tl,Pb,Bi]', 0),
    4: ('[Ac,Th,Pa,U,Np,Pu,Am,Cm,Bk,Cf,Es,Fm,Md,No,Lr]', 0),
    5: ('[Sc,Ti,Y,Zr,Hf]', 0),
    6: ('[La,Ce,Pr,Nd,Pm,Sm,Eu,Gd,Tb,Dy,Ho,Er,Tm,Yb,Lu]', 0),
    7: ('[V,Cr,Mn,Nb,Mo,Tc,Ta,W,Re]', 0),
    8: ('[!C;!c;!#1]1~*~*~*~1', 0),
    9: ('[Fe,Co,Ni,Ru,Rh,Pd,Os,Ir,Pt]', 0),
    10: ('[Be,Mg,Ca,Sr,Ba,Ra]', 0),
    11: ('*1~*~*~*~1', 0),
    12: ('[Cu,Zn,Ag,Cd,Au,Hg]', 0),
    13: ('[O,o]~[N,n](~[C,c])~[C,c]', 0),
    14: ('[S,s]-[S,s]', 0),
    15: ('[O,o]~[C,c](~[O,o])~[O,o]', 0),
    16: ('[!C;!c;!#1]1~*~*~1', 0),
    17: ('[C,c]#[C,c]', 0),
    18: ('[B,Al,Ga,In,Tl]', 0),
    19: ('*1~*~*~*~*~*~*~1', 0),
    20: ('[Si]', 0),
    21: ('[C,c]=[C,c](~[!C;!c;!#1])~[!C;!c;!#1]', 0),
    22: ('*1~*~*~1', 0),
    23: ('[N,n]~[C,c](~[O,o])~[O,o]', 0),
    24: ('[N,n]-[O,o]', 0),
    25: ('[N,n]~[C,c](~[N,n])~[N,n]', 0),
    26: ('[C,c]=;@[C,c](@*)@*', 0),
    27: ('[I]', 0),
    28: ('[!C;!c;!#1]~[CH2]~[!C;!c;!#1]', 0),
    29: ('P', 0),
    30: ('[C,c]~[!C;!c;!#1](~[C,c])(~[C,c])~*', 0),
    31: ('[!C;!c;!#1]~[F,Cl,Br,I]', 0),
    32: ('[C,c]~[S,s]~[N,n]', 0),
    33: ('[N,n]~[S,s]', 0),
    34: ('[CH2]=*', 0),
    35: ('[Li,Na,K,Rb,Cs,Fr]', 0),
    36: ('[$(S@*),$(s@*)]', 0),
    37: ('[N,n]~[C,c](~[O,o])~[N,n]', 0),
    38: ('[N,n]~[C,c](~[C,c])~[N,n]', 0),
    39: ('[O,o]~[S,s](~[O,o])~[O,o]', 0),
    40: ('[S,s]-[O,o]', 0),
    41: ('[C,c]#[N,n]', 0),
    42: ('F', 0),
    43: ('[!C;!c;!#1;H,H2,H3,H4]~*~[!C;!c;!#1;H,H2,H3,H4]', 0),
    44: ('?', 0),
    45: ('[C,c]=[C,c]~[N,n]', 0),
    46: ('Br', 0),
    47: ('[S,s]~*~[N,n]', 0),
    48: ('[O,o]~[!C;!c;!#1](~[O,o])(~[O,o])~*', 0),
    49: ('[-,-2,-3,-4,+,+2,+3,+4]', 0),
    50: ('[C,c]=[C,c](~[C,c])~[C,c]', 0),
    51: ('[C,c]~[S,s]~[O,o]', 0),
    52: ('[N,n]~[N,n]', 0),
    53: ('[!C;!c;!#1;H,H2,H3,H4]~*~*~*~[!C;!c;!#1;H,H2,H3,H4]', 0),
    54: ('[!C;!c;!#1;H,H2,H3,H4]~*~*~[!C;!c;!#1;H,H2,H3,H4]', 0),
    55: ('[O,o]~[S,s]~[O,o]', 0),
    56: ('[O,o]~[N,n](~[O,o])~[C,c]', 0),
    57: ('[$(O@*),$(o@*)]', 0),
    58: ('[!C;!c;!#1]~[S,s]~[!C;!c;!#1]', 0),
    59: ('[S,s]!:*:*', 0),
    60: ('[S,s]=[O,o]', 0),
    61: ('*~[S,s](~*)~*', 0),
    62: ('*@*!@*@*', 0),
    63: ('[N,n]=[O,o]', 0),
    64: ('*@*!@[S,s]', 0),
    65: ('[C,c]:[N,n]', 0),
    66: ('[C,c]~[C,c](~[C,c])(~[C,c])~*', 0),
    67: ('[!C;!c;!#1]~[S,s]', 0),
    68: ('[!C;!c;!#1;H,H2,H3,H4]~[!C;!c;!#1;H,H2,H3,H4]', 0),
    69: ('[!C;!c;!#1]~[!C;!c;!#1;H,H2,H3,H4]', 0),
    70: ('[!C;!c;!#1]~[N,n]~[!C;!c;!#1]', 0),
    71: ('[N,n]~[O,o]', 0),
    72: ('[O,o]~*~*~[O,o]', 0),
    73: ('[S,s]=*', 0),
    74: ('[CH3]~*~[CH3]', 0),
    75: ('*!@[N,n]@*', 0),
    76: ('[C,c]=[C,c](~*)~*', 0),
    77: ('[N,n]~*~[N,n]', 0),
    78: ('[C,c]=[N,n]', 0),
    79: ('[N,n]~*~*~[N,n]', 0),
    80: ('[N,n]~*~*~*~[N,n]', 0),
    81: ('[S,s]~*(~*)~*', 0),
    82: ('*~[CH2]~[!C;!c;!#1;H,H2,H3,H4]', 0),
    83: ('[!C;!c;!#1]1~*~*~*~*~1', 0),
    84: ('[NH2]', 0),
    85: ('[C,c]~[N,n](~[C,c])~[C,c]', 0),
    86: ('[CH2][!C;!c;!#1][CH2]', 0),
    87: ('[F,Cl,Br,I]!@*@*', 0),
    88: ('[S,s]', 0),
    89: ('[O,o]~*~*~*~[O,o]', 0),
    90: ('[!C;!c;!#1;H,H2,H3,H4]~*~*~[CH2]~*', 0),
    91: ('[!C;!c;!#1;H,H2,H3,H4]~*~*~*~[CH2]~*', 0),
    92: ('[O,o]~[C,c](~[N,n])~[C,c]', 0),
    93: ('[!C;!c;!#1]~[CH3]', 0),
    94: ('[!C;!c;!#1]~[N,n]', 0),
    95: ('[N,n]~*~*~[O,o]', 0),
    96: ('*1~*~*~*~*~1', 0),
    97: ('[N,n]~*~*~*~[O,o]', 0),
    98: ('[!C;!c;!#1]1~*~*~*~*~*~1', 0),
    99: ('[C,c]=[C,c]', 0),
    100: ('*~[CH2]~[N,n]', 0),
    101: ('[r8,r9,r10,r11,r12]', 0),
    102: ('[!C;!c;!#1]~[O,o]', 0),
    103: ('Cl', 0),
    104: ('[!C;!c;!#1;H,H2,H3,H4]~*~[CH2]~*', 0),
    105: ('[!C;!c;!#1]@*(@*)@*', 0),
    106: ('[!C;!c;!#1]~*(~[!C;!c;!#1])~[!C;!c;!#1]', 0),
    107: ('[F,Cl,Br,I]~*(~*)~*', 0),
    108: ('[CH3]~*~*~*~[CH2]~*', 0),
    109: ('*~[CH2]~[O,o]', 0),
    110: ('[N,n]~[C,c]~[O,o]', 0),
    111: ('[N,n]~*~[CH2]~*', 0),
    112: ('*~*(~*)(~*)~*', 0),
    113: ('[O,o]!:*:*', 0),
    114: ('[CH3]~[CH2]~*', 0),
    115: ('[CH3]~*~[CH2]~*', 0),
    116: ('[CH3]~*~*~[CH2]~*', 0),
    117: ('[N,n]~*~[O,o]', 0),
    118: ('*~[CH2]~[CH2]~*', 1),
    119: ('[N,n]=*', 0),
    120: ('[!C;!c;R]', 1),
    121: ('[$(N@*),$(n@*)]', 0),
    122: ('*~[N,n](~*)~*', 0),
    123: ('[O,o]~[C,c]~[O,o]', 0),
    124: ('[!C;!c;!#1]~[!C;!c;!#1]', 0),
    125: ('?', 0),
    126: ('*!@[O,o]!@*', 0),
    127: ('*@*!@[O,o]', 1),
    128: ('*~[CH2]~*~*~*~[CH2]~*', 0),
    129: ('*~[CH2]~*~*~[CH2]~*', 0),
    130: ('[!C;!c;!#1]~[!C;!c;!#1]', 1),
    131: ('[!C;!c;!#1;H,H2,H3,H4]', 1),
    132: ('[O,o]~*~[CH2]~*', 0),
    133: ('*@*!:[N,n]', 0),
    134: ('[F,Cl,Br,I]', 0),
    135: ('[N,n]!:*:*', 0),
    136: ('[O,o]=*', 1),
    137: ('[!C;!c;R]', 0),
    138: ('[!C;!c;!#1]~[CH2]~*', 1),
    139: ('[OH,OH2,OH3]', 0),
    140: ('[O,o]', 3),
    141: ('[CH3]', 2),
    142: ('[N,n]', 1),
    143: ('*@*!@[O,o]', 0),
    144: ('*!:*:*!:*', 0),
    145: ('*1~*~*~*~*~*~1', 1),
    146: ('[O,o]', 2),
    147: ('*~[CH2]~[CH2]~*', 0),
    148: ('*~[!C;!c;!#1](~*)~*', 0),
    149: ('[CH3]', 1),
    150: ('*!@*@*!@*', 0),
    151: ('[NH,NH2,NH3,NH4]', 0),
    152: ('[O,o]~[C,c](~[C,c])~[C,c]', 0),
    153: ('[!C;!c;!#1]~[CH2]~*', 0),
    154: ('[C,c]=[O,o]', 0),
    155: ('*!@[CH2]!@*', 0),
    156: ('[N,n]~*(~*)~*', 0),
    157: ('[C,c]-[O,o]', 0),
    158: ('[C,c]-[N,n]', 0),
    159: ('[O,o]', 1),
    160: ('[CH3]', 0),
    161: ('[N,n]', 0),
    162: ('a', 0),
    163: ('*1~*~*~*~*~*~1', 0),
    164: ('[O,o]', 0),
    165: ('[R]', 0),
    166: ('?', 0),
}

# Cache of the compiled (query, count) pairs; stays None until the first
# fingerprint is requested.
maccsKeys = None
185
187 """ *Internal Use Only*
188
189 generates SMARTS patterns for the keys, run once
190
191 """
192 assert len(keyList) == len(keyDict.keys()),'length mismatch'
193 for key in keyDict.keys():
194 patt,count = keyDict[key]
195 if patt != '?':
196 try:
197 sma = Chem.MolFromSmarts(patt)
198 except:
199 sma = None
200 if not sma:
201 print 'SMARTS parser error for key #%d: %s'%(key,patt)
202 else:
203 keyList[key-1] = sma,count
204
def GenMACCSKeys(mol, **kwargs):
    """ generates the MACCS fingerprint for a molecule

    **Arguments**

      - mol: the molecule to be fingerprinted

      - any extra keyword arguments are ignored

    **Returns**

      a _DataStructs.SparseBitVect_ containing the fingerprint.

    **Notes**

      - the compiled key patterns are built on the first call and cached
        in the module-level maccsKeys list

      - bit numbers equal key numbers, so bit 0 is never set

    >>> m = Chem.MolFromSmiles('CNO')
    >>> bv = GenMACCSKeys(m)
    >>> tuple(bv.GetOnBits())
    (24, 68, 69, 71, 93, 94, 102, 124, 131, 139, 151, 158, 160, 161, 164)
    >>> bv = GenMACCSKeys(Chem.MolFromSmiles('CCC'))
    >>> tuple(bv.GetOnBits())
    (74, 114, 149, 155, 160)

    """
    global maccsKeys
    if maccsKeys is None:
        # fill a local list first and publish it only once it is complete,
        # so a failure inside _InitKeys cannot leave a half-built cache that
        # later calls would silently reuse
        keys = [(None, 0)] * len(smartsPatts)
        _InitKeys(keys, smartsPatts)
        maccsKeys = keys

    res = DataStructs.SparseBitVect(len(maccsKeys) + 1)
    for i, (patt, count) in enumerate(maccsKeys):
        if patt is None:
            # key without a usable pattern: its bit stays off
            continue
        matches = mol.GetSubstructMatches(patt)
        # count == 0: any match sets the bit; otherwise the molecule needs
        # strictly more than `count` matches
        if matches and (count == 0 or len(matches) > count):
            res[i + 1] = 1
    return res


# generic alias for the fingerprinting entry point
FingerprintMol = GenMACCSKeys
245
246
247
248
249
251 import doctest,sys
252 return doctest.testmod(sys.modules["__main__"])
253
if __name__ == '__main__':
    import sys
    # run the module doctests; the number of failures becomes the process
    # exit status, so 0 means every doctest passed
    failed, tried = _test()
    sys.exit(failed)
258