RDKit
Open-source cheminformatics and machine learning.
AtomPairs.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2007-2013 Greg Landrum
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 /*! \file AtomPairs.h
12 
13 
14  A few quick notes about fingerprint size and the way chirality is handled in
15  these functions.
16 
17  By default the atom-pair and topological-torsion fingerprints do not include any
18  information about
19  chirality; the atom invariants only include information about the atomic
20  number,
21  number of pi electrons, and degree.
22  When chirality is included, two additional bits are added to the atom
23  invariants to flag R/S/no
24  chirality. These additional bits change the size of the atom invariants and
25  either the size
26  of the final fingerprint (atom pairs) or the maximum allowed path length
27  (torsions). This means
28  that even fingerprints for achiral molecules are different when
29  includeChirality is true.
30 
31 */
32 #ifndef __RD_ATOMPAIRS_H__
33 #define __RD_ATOMPAIRS_H__
34 
36 #include <DataStructs/BitVects.h>
37 #include <boost/cstdint.hpp>
38 namespace RDKit {
39 class Atom;
40 
41 namespace AtomPairs {
42 const std::string atomPairsVersion = "1.1.0";  // version tag for this code
43 const unsigned int numTypeBits = 4;  // bits for the atom-type field
44 const unsigned int atomNumberTypes[1 << numTypeBits] = {  // 16 slots, 15 values listed: the last slot is zero-initialized
45  5, 6, 7, 8, 9, 14, 15, 16, 17, 33, 34, 35, 51, 52, 43};
46 const unsigned int numPiBits = 2;  // bits for the pi-electron field (presumably; see file comment)
47 const unsigned int maxNumPi = (1 << numPiBits) - 1;  // largest value that fits in numPiBits: 3
48 const unsigned int numBranchBits = 3;  // bits for the branch (presumably degree) field
49 const unsigned int maxNumBranches = (1 << numBranchBits) - 1;  // largest value that fits in numBranchBits: 7
50 const unsigned int numChiralBits = 2;  // not part of codeSize below
51 const unsigned int codeSize = numTypeBits + numPiBits + numBranchBits;  // 4 + 2 + 3 = 9 bits
52 const unsigned int numPathBits = 5;  // bits for the path-length field
53 const unsigned int maxPathLen = (1 << numPathBits) - 1;  // largest value that fits in numPathBits: 31
54 const unsigned int numAtomPairFingerprintBits =
55  numPathBits + 2 * codeSize; // 5 + 2 * 9 = 23; note that this is only accurate if chirality
56  // is not included
57 
58 //! returns a numeric code for the atom (the atom's hash in the
59 //! atom-pair scheme)
60 /*!
61  \param atom the atom to be considered
62  \param branchSubtract (optional) a constant to subtract from
63  the number of neighbors when the hash is calculated
64  (used in the topological torsions code)
65  \param includeChirality toggles the inclusion of bits indicating R/S
66  chirality
67  \return the atom's code as a 32-bit unsigned integer
68 */
69 boost::uint32_t getAtomCode(const Atom *atom, unsigned int branchSubtract = 0,
70  bool includeChirality = false);
71 
72 //! returns an atom pair hash based on two atom hashes and the
73 //! distance between the atoms.
74 /*!
75  \param codeI the hash for the first atom
76  \param codeJ the hash for the second atom
77  \param dist the distance (number of bonds) between the two atoms
78  \param includeChirality toggles the inclusion of bits indicating R/S
79  chirality
80  \return the pair's hash as a 32-bit unsigned integer
81 */
82 boost::uint32_t getAtomPairCode(boost::uint32_t codeI, boost::uint32_t codeJ,
83  unsigned int dist,
84  bool includeChirality = false);
85 
86 //! returns the atom-pair fingerprint for a molecule
87 /*!
88  The algorithm used is described here:
89  R.E. Carhart, D.H. Smith, R. Venkataraghavan; "Atom Pairs as
90  Molecular Features in Structure-Activity Studies: Definition
91  and Applications" JCICS 25, 64-73 (1985).
92 
93 
94  \param mol: the molecule to be fingerprinted
95  \param minLength: minimum distance between atoms to be
96  considered in a pair. Default is 1 bond.
97  \param maxLength: maximum distance between atoms to be
98  considered in a pair.
99  Default is maxPathLen-1 bonds.
100  \param fromAtoms: if provided, only atom pairs that involve
101  the specified atoms will be included in the
102  fingerprint
103  \param ignoreAtoms: if provided, any atom pairs that include
104  the specified atoms will not be included in the
105  fingerprint
106  \param atomInvariants: a list of invariants to use for the atom hashes
107  note: only the first \c codeSize bits of each
108  invariant are used.
109  \param includeChirality: if set, chirality will be used in the atom invariants
110  (note: this is ignored if atomInvariants are
111  provided)
112  \param use2D: if set, the 2D (topological) distance matrix is used.
113  \param confId: the conformation to use if 3D distances are being used
114 
115 
116  \return a pointer to the fingerprint. The client is
117  responsible for calling delete on this.
118 
119 */
121  const ROMol &mol, unsigned int minLength, unsigned int maxLength,
122  const std::vector<boost::uint32_t> *fromAtoms = 0,
123  const std::vector<boost::uint32_t> *ignoreAtoms = 0,
124  const std::vector<boost::uint32_t> *atomInvariants = 0,
125  bool includeChirality = false, bool use2D = true, int confId = -1);
126 //! \overload
128  const ROMol &mol, const std::vector<boost::uint32_t> *fromAtoms = 0,
129  const std::vector<boost::uint32_t> *ignoreAtoms = 0,
130  const std::vector<boost::uint32_t> *atomInvariants = 0,
131  bool includeChirality = false, bool use2D = true, int confId = -1);
132 
133 //! returns the hashed atom-pair fingerprint for a molecule
134 /*!
135  \param mol: the molecule to be fingerprinted
136  \param nBits: the length of the fingerprint to generate
137  \param minLength: minimum distance between atoms to be
138  considered in a pair. Default is 1 bond.
139  \param maxLength: maximum distance between atoms to be
140  considered in a pair.
141  Default is maxPathLen-1 bonds.
142  \param fromAtoms: if provided, only atom pairs that involve
143  the specified atoms will be included in the
144  fingerprint
145  \param ignoreAtoms: if provided, any atom pairs that include
146  the specified atoms will not be included in the
147  fingerprint
148  \param atomInvariants: a list of invariants to use for the atom hashes
149  note: only the first \c codeSize bits of each
150  invariant are used.
151  \param includeChirality: if set, chirality will be used in the atom invariants
152  (note: this is ignored if atomInvariants are
153  provided)
154  \param use2D: if set, the 2D (topological) distance matrix is used.
155  \param confId: the conformation to use if 3D distances are being used
156  \return a pointer to the fingerprint. The client is
157  responsible for calling delete on this.
158 
159 */
161  const ROMol &mol, unsigned int nBits = 2048, unsigned int minLength = 1,
162  unsigned int maxLength = maxPathLen - 1,
163  const std::vector<boost::uint32_t> *fromAtoms = 0,
164  const std::vector<boost::uint32_t> *ignoreAtoms = 0,
165  const std::vector<boost::uint32_t> *atomInvariants = 0,
166  bool includeChirality = false, bool use2D = true, int confId = -1);
167 //! returns the hashed atom-pair fingerprint for a molecule as a bit vector
168 /*!
169  \param mol: the molecule to be fingerprinted
170  \param nBits: the length of the fingerprint to generate
171  \param minLength: minimum distance between atoms to be
172  considered in a pair. Default is 1 bond.
173  \param maxLength: maximum distance between atoms to be
174  considered in a pair.
175  Default is maxPathLen-1 bonds.
176  \param fromAtoms: if provided, only atom pairs that involve
177  the specified atoms will be included in the
178  fingerprint
179  \param ignoreAtoms: if provided, any atom pairs that include
180  the specified atoms will not be included in the
181  fingerprint
182  \param atomInvariants: a list of invariants to use for the atom hashes
183  note: only the first \c codeSize bits of each
184  invariant are used.
185  \param nBitsPerEntry: number of bits to use in simulating counts
186  \param includeChirality: if set, chirality will be used in the atom invariants
187  (note: this is ignored if atomInvariants are
188  provided)
189  \param use2D: if set, the 2D (topological) distance matrix is used.
190  \param confId: the conformation to use if 3D distances are being used
191 
192  \return a pointer to the fingerprint. The client is
193  responsible for calling delete on this.
194 
195 */
197  const ROMol &mol, unsigned int nBits = 2048, unsigned int minLength = 1,
198  unsigned int maxLength = maxPathLen - 1,
199  const std::vector<boost::uint32_t> *fromAtoms = 0,
200  const std::vector<boost::uint32_t> *ignoreAtoms = 0,
201  const std::vector<boost::uint32_t> *atomInvariants = 0,
202  unsigned int nBitsPerEntry = 4, bool includeChirality = false,
203  bool use2D = true, int confId = -1);
204 
205 //! returns a topological torsion hash based on the atom hashes
206 //! passed in
207 /*! \param atomCodes the vector of atom hashes
208  \param includeChirality NOTE(review): presumably whether atomCodes carry chirality bits - confirm
209 */
210 boost::uint64_t getTopologicalTorsionCode(
211  const std::vector<boost::uint32_t> &atomCodes,
212  bool includeChirality = false);
213 
214 //! returns the topological-torsion fingerprint for a molecule
215 /*!
216  The algorithm used is described here:
217  R. Nilakantan, N. Bauman, J. S. Dixon, R. Venkataraghavan;
218  "Topological Torsion: A New Molecular Descriptor for SAR Applications.
219  Comparison with Other Descriptors" JCICS 27, 82-85 (1987).
220 
221  \param mol: the molecule to be fingerprinted
222  \param targetSize: the number of atoms to include in the "torsions"
223  \param fromAtoms: if provided, only torsions that start or end at
224  the specified atoms will be included in the
225  fingerprint
226  \param ignoreAtoms: if provided, any torsions that include
227  the specified atoms will not be included in the
228  fingerprint
229  \param atomInvariants: a list of invariants to use for the atom hashes
230  note: only the first \c codeSize bits of each
231  invariant are used.
232  \param includeChirality: if set, chirality will be used in the atom invariants
233  (note: this is ignored if atomInvariants are
234  provided)
235 
236  \return a pointer to the fingerprint. The client is
237  responsible for calling delete on this.
238 
239 */
241  const ROMol &mol, unsigned int targetSize = 4,
242  const std::vector<boost::uint32_t> *fromAtoms = 0,
243  const std::vector<boost::uint32_t> *ignoreAtoms = 0,
244  const std::vector<boost::uint32_t> *atomInvariants = 0,
245  bool includeChirality = false);
246 //! returns a hashed topological-torsion fingerprint for a molecule
247 /*!
248  The algorithm used is described here:
249  R. Nilakantan, N. Bauman, J. S. Dixon, R. Venkataraghavan;
250  "Topological Torsion: A New Molecular Descriptor for SAR Applications.
251  Comparison with Other Descriptors" JCICS 27, 82-85 (1987).
252 
253  \param mol: the molecule to be fingerprinted
254  \param nBits: number of bits to include in the fingerprint
255  \param targetSize: the number of atoms to include in the "torsions"
256  \param fromAtoms: if provided, only torsions that start or end at
257  the specified atoms will be included in the
258  fingerprint
259  \param ignoreAtoms: if provided, any torsions that include
260  the specified atoms will not be included in the
261  fingerprint
262  \param atomInvariants: a list of invariants to use for the atom hashes
263  note: only the first \c codeSize bits of each
264  invariant are used.
265  \param includeChirality: if set, chirality will be used in the atom invariants
266  (note: this is ignored if atomInvariants are
267  provided)
268 
269  \return a pointer to the fingerprint. The client is
270  responsible for calling delete on this.
271 
272 */
274  const ROMol &mol, unsigned int nBits = 2048, unsigned int targetSize = 4,
275  const std::vector<boost::uint32_t> *fromAtoms = 0,
276  const std::vector<boost::uint32_t> *ignoreAtoms = 0,
277  const std::vector<boost::uint32_t> *atomInvariants = 0,
278  bool includeChirality = false);
279 //! returns a hashed topological-torsion fingerprint for a molecule as a bit
280 //! vector
281 /*!
282  \param mol: the molecule to be fingerprinted
283  \param nBits: number of bits to include in the fingerprint
284  \param targetSize: the number of atoms to include in the "torsions"
285  \param fromAtoms: if provided, only torsions that start or end at
286  the specified atoms will be included in the
287  fingerprint
288  \param ignoreAtoms: if provided, any torsions that include
289  the specified atoms will not be included in the
290  fingerprint
291  \param atomInvariants: a list of invariants to use for the atom hashes
292  note: only the first \c codeSize bits of each
293  invariant are used.
294  \param nBitsPerEntry: number of bits to use in simulating counts
295  \param includeChirality: if set, chirality will be used in the atom invariants
296  (note: this is ignored if atomInvariants are
297  provided)
298 
299  \return a pointer to the fingerprint. The client is
300  responsible for calling delete on this.
301 
302 */
304  const ROMol &mol, unsigned int nBits = 2048, unsigned int targetSize = 4,
305  const std::vector<boost::uint32_t> *fromAtoms = 0,
306  const std::vector<boost::uint32_t> *ignoreAtoms = 0,
307  const std::vector<boost::uint32_t> *atomInvariants = 0,
308  unsigned int nBitsPerEntry = 4, bool includeChirality = false);
309 }
310 }
311 
312 #endif
const std::string atomPairsVersion
Definition: AtomPairs.h:42
Pulls in all the BitVect classes.
SparseIntVect< boost::int32_t > * getHashedAtomPairFingerprint(const ROMol &mol, unsigned int nBits=2048, unsigned int minLength=1, unsigned int maxLength=maxPathLen-1, const std::vector< boost::uint32_t > *fromAtoms=0, const std::vector< boost::uint32_t > *ignoreAtoms=0, const std::vector< boost::uint32_t > *atomInvariants=0, bool includeChirality=false, bool use2D=true, int confId=-1)
returns the hashed atom-pair fingerprint for a molecule
const unsigned int maxPathLen
Definition: AtomPairs.h:53
const unsigned int numPathBits
Definition: AtomPairs.h:52
const unsigned int maxNumPi
Definition: AtomPairs.h:47
ROMol is a molecule class that is intended to have a fixed topology.
Definition: ROMol.h:106
boost::uint32_t getAtomCode(const Atom *atom, unsigned int branchSubtract=0, bool includeChirality=false)
const unsigned int numPiBits
Definition: AtomPairs.h:46
Includes a bunch of functionality for handling Atom and Bond queries.
Definition: Atom.h:29
const unsigned int maxNumBranches
Definition: AtomPairs.h:49
SparseIntVect< boost::int64_t > * getTopologicalTorsionFingerprint(const ROMol &mol, unsigned int targetSize=4, const std::vector< boost::uint32_t > *fromAtoms=0, const std::vector< boost::uint32_t > *ignoreAtoms=0, const std::vector< boost::uint32_t > *atomInvariants=0, bool includeChirality=false)
returns the topological-torsion fingerprint for a molecule
const unsigned int numBranchBits
Definition: AtomPairs.h:48
boost::uint64_t getTopologicalTorsionCode(const std::vector< boost::uint32_t > &atomCodes, bool includeChirality=false)
ExplicitBitVect * getHashedTopologicalTorsionFingerprintAsBitVect(const ROMol &mol, unsigned int nBits=2048, unsigned int targetSize=4, const std::vector< boost::uint32_t > *fromAtoms=0, const std::vector< boost::uint32_t > *ignoreAtoms=0, const std::vector< boost::uint32_t > *atomInvariants=0, unsigned int nBitsPerEntry=4, bool includeChirality=false)
returns a hashed topological-torsion fingerprint for a molecule as a bit
const unsigned int numAtomPairFingerprintBits
Definition: AtomPairs.h:54
SparseIntVect< boost::int64_t > * getHashedTopologicalTorsionFingerprint(const ROMol &mol, unsigned int nBits=2048, unsigned int targetSize=4, const std::vector< boost::uint32_t > *fromAtoms=0, const std::vector< boost::uint32_t > *ignoreAtoms=0, const std::vector< boost::uint32_t > *atomInvariants=0, bool includeChirality=false)
returns a hashed topological-torsion fingerprint for a molecule
a class for efficiently storing sparse vectors of ints
Definition: SparseIntVect.h:27
a class for bit vectors that are densely occupied
boost::uint32_t getAtomPairCode(boost::uint32_t codeI, boost::uint32_t codeJ, unsigned int dist, bool includeChirality=false)
const unsigned int codeSize
Definition: AtomPairs.h:51
const unsigned int atomNumberTypes[1<< numTypeBits]
Definition: AtomPairs.h:44
The class for representing atoms.
Definition: Atom.h:68
ExplicitBitVect * getHashedAtomPairFingerprintAsBitVect(const ROMol &mol, unsigned int nBits=2048, unsigned int minLength=1, unsigned int maxLength=maxPathLen-1, const std::vector< boost::uint32_t > *fromAtoms=0, const std::vector< boost::uint32_t > *ignoreAtoms=0, const std::vector< boost::uint32_t > *atomInvariants=0, unsigned int nBitsPerEntry=4, bool includeChirality=false, bool use2D=true, int confId=-1)
returns the hashed atom-pair fingerprint for a molecule as a bit vector
SparseIntVect< boost::int32_t > * getAtomPairFingerprint(const ROMol &mol, unsigned int minLength, unsigned int maxLength, const std::vector< boost::uint32_t > *fromAtoms=0, const std::vector< boost::uint32_t > *ignoreAtoms=0, const std::vector< boost::uint32_t > *atomInvariants=0, bool includeChirality=false, bool use2D=true, int confId=-1)
returns the atom-pair fingerprint for a molecule
const unsigned int numChiralBits
Definition: AtomPairs.h:50
const unsigned int numTypeBits
Definition: AtomPairs.h:43