RDKit
Open-source cheminformatics and machine learning.
MorganFingerprints.h
Go to the documentation of this file.
1 //
2 //
3 // Copyright (c) 2009-2010, Novartis Institutes for BioMedical Research Inc.
4 // All rights reserved.
5 //
6 // Redistribution and use in source and binary forms, with or without
7 // modification, are permitted provided that the following conditions are
8 // met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above
13 // copyright notice, this list of conditions and the following
14 // disclaimer in the documentation and/or other materials provided
15 // with the distribution.
16 // * Neither the name of Novartis Institutes for BioMedical Research Inc.
17 // nor the names of its contributors may be used to endorse or promote
18 // products derived from this software without specific prior written
19 // permission.
20 //
21 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 //
33 // Created by Greg Landrum, July 2008
34 //
35 //
36 
37 /*! \file MorganFingerprints.h
38 
39 */
40 #ifndef __RD_MORGANFPS_H__
41 #define __RD_MORGANFPS_H__
42 
43 #include <vector>
44 #include <map>
47 #include <boost/cstdint.hpp>
48 
49 namespace RDKit {
50 class ROMol;
51 namespace MorganFingerprints {
52 extern std::vector<std::string> defaultFeatureSmarts;
53 
54 typedef std::map<boost::uint32_t,
55  std::vector<std::pair<boost::uint32_t, boost::uint32_t> > >
57 
58 const std::string morganFingerprintVersion = "1.0.0";
59 
60 //! returns the Morgan fingerprint for a molecule
61 /*!
62  These fingerprints are similar to the well-known ECFP or
63  FCFP fingerprints, depending on which invariants are used.
64 
65  The algorithm used is described in the paper
66  Rogers, D. & Hahn, M. Extended-Connectivity Fingerprints. JCIM 50:742-54
67  (2010)
68  http://dx.doi.org/10.1021/ci100050t
69 
70  The original implementation was done using this paper:
71  D. Rogers, R.D. Brown, M. Hahn J. Biomol. Screen. 10:682-6 (2005)
72  and an unpublished technical report:
73  http://www.ics.uci.edu/~welling/teaching/ICS274Bspring06/David%20Rogers%20-%20ECFP%20Manuscript.doc
74 
75  \param mol: the molecule to be fingerprinted
76  \param radius: the number of iterations to grow the fingerprint
77  \param invariants : optional pointer to a set of atom invariants to
78  be used. By default ECFP-type invariants are used
79  (calculated by getConnectivityInvariants())
80  \param fromAtoms : if this is provided, only the atoms in the vector will be
81  used as centers in the fingerprint
82  \param useChirality : if set, additional information will be added to the
83  fingerprint
84  when chiral atoms are discovered. This will cause
85  \verbatim C[C@H](F)Cl,
86  C[C@@H](F)Cl, and CC(F)Cl \endverbatim to generate
87  different fingerprints.
88  \param useBondTypes : if set, bond types will be included as part of the hash
89  for
90  calculating bits
91  \param useCounts : if set, counts of the features will be used
92  \param onlyNonzeroInvariants : if set, bits will only be set from atoms that
93  have a nonzero invariant.
94  \param atomsSettingBits : if non-null, this will be used to return information
95  about the atoms that set each particular bit.
96  The keys of the map are bit ids, the values
97  are lists of (atomId, radius) pairs.
98 
99  \return a pointer to the fingerprint. The client is
100  responsible for calling delete on this.
101 
102 */
104  const ROMol &mol, unsigned int radius,
105  std::vector<boost::uint32_t> *invariants = 0,
106  const std::vector<boost::uint32_t> *fromAtoms = 0,
107  bool useChirality = false, bool useBondTypes = true, bool useCounts = true,
108  bool onlyNonzeroInvariants = false, BitInfoMap *atomsSettingBits = 0);
109 
110 //! returns the hashed Morgan fingerprint for a molecule
111 /*!
112  These fingerprints are similar to the well-known ECFP or
113  FCFP fingerprints, depending on which invariants are used.
114 
115  The algorithm used is described in the paper
116  Rogers, D. & Hahn, M. Extended-Connectivity Fingerprints. JCIM 50:742-54
117  (2010)
118  http://dx.doi.org/10.1021/ci100050t
119 
120  The original implementation was done using this paper:
121  D. Rogers, R.D. Brown, M. Hahn J. Biomol. Screen. 10:682-6 (2005)
122  and an unpublished technical report:
123  http://www.ics.uci.edu/~welling/teaching/ICS274Bspring06/David%20Rogers%20-%20ECFP%20Manuscript.doc
124 
125  \param mol: the molecule to be fingerprinted
126  \param radius: the number of iterations to grow the fingerprint
127  \param invariants : optional pointer to a set of atom invariants to
128  be used. By default ECFP-type invariants are used
129  (calculated by getConnectivityInvariants())
130  \param fromAtoms : if this is provided, only the atoms in the vector will be
131  used as centers in the fingerprint
132  \param useChirality : if set, additional information will be added to the
133  fingerprint
134  when chiral atoms are discovered. This will cause
135  \verbatim C[C@H](F)Cl,
136  C[C@@H](F)Cl, and CC(F)Cl \endverbatim to generate
137  different fingerprints.
138  \param useBondTypes : if set, bond types will be included as part of the hash
139  for
140  calculating bits
141  \param onlyNonzeroInvariants : if set, bits will only be set from atoms that
142  have a nonzero invariant.
143  \param atomsSettingBits : if non-null, this will be used to return information
144  about the atoms that set each particular bit.
145  The keys of the map are bit ids, the values
146  are lists of (atomId, radius) pairs.
147 
148  \return a pointer to the fingerprint. The client is
149  responsible for calling delete on this.
150 
151 */
153  const ROMol &mol, unsigned int radius, unsigned int nBits = 2048,
154  std::vector<boost::uint32_t> *invariants = 0,
155  const std::vector<boost::uint32_t> *fromAtoms = 0,
156  bool useChirality = false, bool useBondTypes = true,
157  bool onlyNonzeroInvariants = false, BitInfoMap *atomsSettingBits = 0);
158 
159 //! returns the Morgan fingerprint for a molecule as a bit vector
160 /*!
161  see documentation for getFingerprint() for theory/references
162 
163  \param mol: the molecule to be fingerprinted
164  \param radius: the number of iterations to grow the fingerprint
165  \param nBits: the number of bits in the final fingerprint
166  \param invariants : optional pointer to a set of atom invariants to
167  be used. By default ECFP-type invariants are used
168  (calculated by getConnectivityInvariants())
169  \param fromAtoms : if this is provided, only the atoms in the vector will be
170  used as centers in the fingerprint
171  \param useChirality : if set, additional information will be added to the
172  fingerprint
173  when chiral atoms are discovered. This will cause
174  \verbatim C[C@H](F)Cl,
175  C[C@@H](F)Cl, and CC(F)Cl \endverbatim to generate
176  different fingerprints.
177  \param useBondTypes : if set, bond types will be included as part of the hash
178  for
179  calculating bits
180  \param onlyNonzeroInvariants : if set, bits will only be set from atoms that
181  have a nonzero invariant.
182  \param atomsSettingBits : if non-null, this will be used to return information
183  about the atoms that set each particular bit.
184  The keys of the map are bit ids, the values
185  are lists of (atomId, radius) pairs.
186 
187  \return a pointer to the fingerprint. The client is
188  responsible for calling delete on this.
189 
190 */
192  const ROMol &mol, unsigned int radius, unsigned int nBits,
193  std::vector<boost::uint32_t> *invariants = 0,
194  const std::vector<boost::uint32_t> *fromAtoms = 0,
195  bool useChirality = false, bool useBondTypes = true,
196  bool onlyNonzeroInvariants = false, BitInfoMap *atomsSettingBits = 0);
197 
198 //! returns the connectivity invariants for a molecule
199 /*!
200  These are the ECFP-type invariants that the fingerprint functions use by default.
201  \param mol : the molecule to be considered
202  \param invars : output vector used to return the results
203  \param includeRingMembership : if set (the default), whether or not the
204  atom is in a ring will be used in the invariant list.
205 */
206 void getConnectivityInvariants(const ROMol &mol,
207  std::vector<boost::uint32_t> &invars,
208  bool includeRingMembership = true);
209 const std::string morganConnectivityInvariantVersion = "1.0.0";
210 
211 //! returns the feature invariants for a molecule
212 /*!
213 
214  \param mol : the molecule to be considered
215  \param invars : output vector used to return the results
216  \param patterns : if provided, should contain the queries used to assign
217  atom types.
218  If not provided, feature definitions adapted from the reference
219  Gobbi and Poppinger, Biotech. Bioeng. _61_ 47-54 (1998)
220  will be used for Donor, Acceptor, Aromatic, Halogen, Basic, and
221  Acidic.
222 
223 */
224 void getFeatureInvariants(const ROMol &mol,
225  std::vector<boost::uint32_t> &invars,
226  std::vector<const ROMol *> *patterns = 0);
227 const std::string morganFeatureInvariantVersion = "0.1.0";
228 
229 } // end of namespace MorganFingerprints
230 }
231 
232 #endif
ExplicitBitVect * getFingerprintAsBitVect(const ROMol &mol, unsigned int radius, unsigned int nBits, std::vector< boost::uint32_t > *invariants=0, const std::vector< boost::uint32_t > *fromAtoms=0, bool useChirality=false, bool useBondTypes=true, bool onlyNonzeroInvariants=false, BitInfoMap *atomsSettingBits=0)
returns the Morgan fingerprint for a molecule as a bit vector
const std::string morganFingerprintVersion
ROMol is a molecule class that is intended to have a fixed topology.
Definition: ROMol.h:103
const std::string morganFeatureInvariantVersion
SparseIntVect< boost::uint32_t > * getHashedFingerprint(const ROMol &mol, unsigned int radius, unsigned int nBits=2048, std::vector< boost::uint32_t > *invariants=0, const std::vector< boost::uint32_t > *fromAtoms=0, bool useChirality=false, bool useBondTypes=true, bool onlyNonzeroInvariants=false, BitInfoMap *atomsSettingBits=0)
returns the Morgan fingerprint for a molecule
std::map< boost::uint32_t, std::vector< std::pair< boost::uint32_t, boost::uint32_t > > > BitInfoMap
std::vector< std::string > defaultFeatureSmarts
void getFeatureInvariants(const ROMol &mol, std::vector< boost::uint32_t > &invars, std::vector< const ROMol * > *patterns=0)
returns the feature invariants for a molecule
const std::string morganConnectivityInvariantVersion
Std stuff.
Definition: Atom.h:29
SparseIntVect< boost::uint32_t > * getFingerprint(const ROMol &mol, unsigned int radius, std::vector< boost::uint32_t > *invariants=0, const std::vector< boost::uint32_t > *fromAtoms=0, bool useChirality=false, bool useBondTypes=true, bool useCounts=true, bool onlyNonzeroInvariants=false, BitInfoMap *atomsSettingBits=0)
returns the Morgan fingerprint for a molecule
void getConnectivityInvariants(const ROMol &mol, std::vector< boost::uint32_t > &invars, bool includeRingMembership=true)
returns the connectivity invariants for a molecule
a class for efficiently storing sparse vectors of ints
Definition: SparseIntVect.h:27
a class for bit vectors that are densely occupied