RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
MolStandardize/Tautomer.h
Go to the documentation of this file.
1//
2// Copyright (C) 2018-2021 Susan H. Leung and other RDKit contributors
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_TAUTOMER_H
12#define RD_TAUTOMER_H
13
14#include <boost/function.hpp>
15#include <string>
16#include <utility>
17#include <iterator>
18#include <Catalogs/Catalog.h>
23#include <boost/dynamic_bitset.hpp>
24
25namespace RDKit {
26class ROMol;
27class RWMol;
28
29namespace MolStandardize {
30
31typedef RDCatalog::HierarchCatalog<TautomerCatalogEntry, TautomerCatalogParams,
32 int>
34
35namespace TautomerScoringFunctions {
36const std::string tautomerScoringVersion = "1.0.0";
37
38//! The SubstructTerm controls how Tautomers are generated
39/// Each Term is defined by a name, smarts pattern and score
40/// For example, the C=O term is defined as
41/// SubstructTerm("C=O", "[#6]=,:[#8]", 2)
42/// This gets a score of +2 for each Carbon doubly or aromatically
43/// Bonded to an Oxygen.
44/// For a list of current definitions, see getDefaultTautomerScoreSubstructs
46 std::string name;
47 std::string smarts;
48 int score;
49 RWMol matcher; // requires assignment
50
51 SubstructTerm(std::string aname, std::string asmarts, int ascore);
52 SubstructTerm(const SubstructTerm &rhs) = default;
53
54 bool operator==(const SubstructTerm &rhs) const {
55 return name == rhs.name && smarts == rhs.smarts && score == rhs.score;
56 }
57};
58
59//! getDefaultTautomerSubstructs returns the SubstructTerms used in scoring
60/// tautomer forms. See SubstructTerm for details.
61RDKIT_MOLSTANDARDIZE_EXPORT const std::vector<SubstructTerm>
63
64//! Score the rings of the current tautomer
65/// Aromatic rings score 100, all carbon aromatic rings score 250
66/*!
67 \param mol Molcule to score
68 \returns integer score for the molecule's rings
69*/
71
72//! scoreSubstructs scores the molecule based on the substructure definitions
73/*!
74 \param mol Molecule to score
75 \param terms Substruct Terms used for scoring this particular tautomer form
76 \returns integer score for the molecule's substructure terms
77*/
79 const ROMol &mol, const std::vector<SubstructTerm> &terms =
81//! scoreHeteroHs score the molecules hydrogens
82/// This gives a negative penalty to hydrogens attached to S,P, Se and Te
83/*!
84 \param mol Molecule to score
85 \returns integer score for the molecule hetero hydrogens
86*/
88
89inline int scoreTautomer(const ROMol &mol) {
90 return scoreRings(mol) + scoreSubstructs(mol) + scoreHeteroHs(mol);
91}
92} // namespace TautomerScoringFunctions
93
100
101class Tautomer {
102 friend class TautomerEnumerator;
103
104 public:
105 Tautomer() : d_numModifiedAtoms(0), d_numModifiedBonds(0), d_done(false) {}
106 Tautomer(ROMOL_SPTR t, ROMOL_SPTR k, size_t a = 0, size_t b = 0)
107 : tautomer(std::move(t)),
108 kekulized(std::move(k)),
109 d_numModifiedAtoms(a),
110 d_numModifiedBonds(b),
111 d_done(false) {}
114
115 private:
116 size_t d_numModifiedAtoms;
117 size_t d_numModifiedBonds;
118 bool d_done;
119};
120
121typedef std::map<std::string, Tautomer> SmilesTautomerMap;
122typedef std::pair<std::string, Tautomer> SmilesTautomerPair;
123
124//! Contains results of tautomer enumeration
126 friend class TautomerEnumerator;
127
128 public:
130 public:
132 typedef std::ptrdiff_t difference_type;
133 typedef const ROMol *pointer;
134 typedef const ROMOL_SPTR &reference;
135 typedef std::bidirectional_iterator_tag iterator_category;
136
137 explicit const_iterator(const SmilesTautomerMap::const_iterator &it)
138 : d_it(it) {}
139 reference operator*() const { return d_it->second.tautomer; }
140 pointer operator->() const { return d_it->second.tautomer.get(); }
141 bool operator==(const const_iterator &other) const {
142 return (d_it == other.d_it);
143 }
144 bool operator!=(const const_iterator &other) const {
145 return !(*this == other);
146 }
148 const_iterator copy(d_it);
149 operator++();
150 return copy;
151 }
153 ++d_it;
154 return *this;
155 }
157 const_iterator copy(d_it);
158 operator--();
159 return copy;
160 }
162 --d_it;
163 return *this;
164 }
165
166 private:
167 SmilesTautomerMap::const_iterator d_it;
168 };
171 : d_tautomers(other.d_tautomers),
172 d_status(other.d_status),
173 d_modifiedAtoms(other.d_modifiedAtoms),
174 d_modifiedBonds(other.d_modifiedBonds) {
175 fillTautomersItVec();
176 }
177 const const_iterator begin() const {
178 return const_iterator(d_tautomers.begin());
179 }
180 const const_iterator end() const { return const_iterator(d_tautomers.end()); }
181 size_t size() const { return d_tautomers.size(); }
182 bool empty() const { return d_tautomers.empty(); }
183 const ROMOL_SPTR &at(size_t pos) const {
184 PRECONDITION(pos < d_tautomers.size(), "index out of bounds");
185 return d_tautomersItVec.at(pos)->second.tautomer;
186 }
187 const ROMOL_SPTR &operator[](size_t pos) const { return at(pos); }
188 const boost::dynamic_bitset<> &modifiedAtoms() const {
189 return d_modifiedAtoms;
190 }
191 const boost::dynamic_bitset<> &modifiedBonds() const {
192 return d_modifiedBonds;
193 }
194 TautomerEnumeratorStatus status() const { return d_status; }
195 std::vector<ROMOL_SPTR> tautomers() const {
196 std::vector<ROMOL_SPTR> tautomerVec;
197 tautomerVec.reserve(d_tautomers.size());
198 std::transform(
199 d_tautomers.begin(), d_tautomers.end(), std::back_inserter(tautomerVec),
200 [](const SmilesTautomerPair &t) { return t.second.tautomer; });
201 return tautomerVec;
202 }
203 std::vector<ROMOL_SPTR> operator()() const { return tautomers(); }
204 std::vector<std::string> smiles() const {
205 std::vector<std::string> smilesVec;
206 smilesVec.reserve(d_tautomers.size());
207 std::transform(d_tautomers.begin(), d_tautomers.end(),
208 std::back_inserter(smilesVec),
209 [](const SmilesTautomerPair &t) { return t.first; });
210 return smilesVec;
211 }
212 const SmilesTautomerMap &smilesTautomerMap() const { return d_tautomers; }
213
214 private:
215 void fillTautomersItVec() {
216 for (auto it = d_tautomers.begin(); it != d_tautomers.end(); ++it) {
217 d_tautomersItVec.push_back(it);
218 }
219 }
220 // the enumerated tautomers
221 SmilesTautomerMap d_tautomers;
222 // internal; vector of iterators into map items to enable random
223 // access to map items by index
224 std::vector<SmilesTautomerMap::const_iterator> d_tautomersItVec;
225 // status of the enumeration: did it complete? did it hit a limit?
226 // was it canceled?
227 TautomerEnumeratorStatus d_status;
228 // bit vector: flags atoms modified by the transforms
229 boost::dynamic_bitset<> d_modifiedAtoms;
230 // bit vector: flags bonds modified by the transforms
231 boost::dynamic_bitset<> d_modifiedBonds;
232};
233
240
242 public:
244 : dp_catalog(tautCat),
245 d_maxTautomers(1000),
246 d_maxTransforms(1000),
247 d_removeSp3Stereo(true),
248 d_removeBondStereo(true),
249 d_removeIsotopicHs(true),
250 d_reassignStereo(true) {}
253 : dp_catalog(other.dp_catalog),
254 d_callback(other.d_callback),
255 d_maxTautomers(other.d_maxTautomers),
256 d_maxTransforms(other.d_maxTransforms),
257 d_removeSp3Stereo(other.d_removeSp3Stereo),
258 d_removeBondStereo(other.d_removeBondStereo),
259 d_removeIsotopicHs(other.d_removeIsotopicHs),
260 d_reassignStereo(other.d_reassignStereo) {}
262 if (this == &other) {
263 return *this;
264 }
265 dp_catalog = other.dp_catalog;
266 d_callback = other.d_callback;
267 d_maxTautomers = other.d_maxTautomers;
268 d_maxTransforms = other.d_maxTransforms;
269 d_removeSp3Stereo = other.d_removeSp3Stereo;
270 d_removeBondStereo = other.d_removeBondStereo;
271 d_removeIsotopicHs = other.d_removeIsotopicHs;
272 d_reassignStereo = other.d_reassignStereo;
273 return *this;
274 }
275 //! \param maxTautomers maximum number of tautomers to be generated
276 void setMaxTautomers(unsigned int maxTautomers) {
277 d_maxTautomers = maxTautomers;
278 }
279 //! \return maximum number of tautomers to be generated
280 unsigned int getMaxTautomers() { return d_maxTautomers; }
281 /*! \param maxTransforms maximum number of transformations to be applied
282 this limit is usually hit earlier than the maxTautomers limit
283 and leads to a more linear scaling of CPU time with increasing
284 number of tautomeric centers (see Sitzmann et al.)
285 */
286 void setMaxTransforms(unsigned int maxTransforms) {
287 d_maxTransforms = maxTransforms;
288 }
289 //! \return maximum number of transformations to be applied
290 unsigned int getMaxTransforms() { return d_maxTransforms; }
291 /*! \param removeSp3Stereo; if set to true, stereochemistry information
292 will be removed from sp3 atoms involved in tautomerism.
293 This means that S-aminoacids will lose their stereochemistry after going
294 through tautomer enumeration because of the amido-imidol tautomerism.
295 This defaults to true in RDKit, false in the workflow described
296 by Sitzmann et al.
297 */
298 void setRemoveSp3Stereo(bool removeSp3Stereo) {
299 d_removeSp3Stereo = removeSp3Stereo;
300 }
301 /*! \return whether stereochemistry information will be removed from
302 sp3 atoms involved in tautomerism
303 */
304 bool getRemoveSp3Stereo() { return d_removeSp3Stereo; }
305 /*! \param removeBondStereo; if set to true, stereochemistry information
306 will be removed from double bonds involved in tautomerism.
307 This means that enols will lose their E/Z stereochemistry after going
308 through tautomer enumeration because of the keto-enolic tautomerism.
309 This defaults to true in RDKit and also in the workflow described
310 by Sitzmann et al.
311 */
312 void setRemoveBondStereo(bool removeBondStereo) {
313 d_removeBondStereo = removeBondStereo;
314 }
315 /*! \return whether stereochemistry information will be removed from
316 double bonds involved in tautomerism
317 */
318 bool getRemoveBondStereo() { return d_removeBondStereo; }
319 /*! \param removeIsotopicHs; if set to true, isotopic Hs
320 will be removed from centers involved in tautomerism.
321 */
322 void setRemoveIsotopicHs(bool removeIsotopicHs) {
323 d_removeIsotopicHs = removeIsotopicHs;
324 }
325 /*! \return whether isotpoic Hs will be removed from
326 centers involved in tautomerism
327 */
328 bool getRemoveIsotopicHs() { return d_removeIsotopicHs; }
329 /*! \param reassignStereo; if set to true, assignStereochemistry
330 will be called on each tautomer generated by the enumerate() method.
331 This defaults to true.
332 */
333 void setReassignStereo(bool reassignStereo) {
334 d_reassignStereo = reassignStereo;
335 }
336 /*! \return whether assignStereochemistry will be called on each
337 tautomer generated by the enumerate() method
338 */
339 bool getReassignStereo() { return d_reassignStereo; }
340 /*! set this to an instance of a class derived from
341 TautomerEnumeratorCallback where operator() is overridden.
342 DO NOT delete the instance as ownership of the pointer is transferred
343 to the TautomerEnumerator
344 */
346 d_callback.reset(callback);
347 }
348 /*! \return pointer to an instance of a class derived from
349 TautomerEnumeratorCallback.
350 DO NOT delete the instance as ownership of the pointer is transferred
351 to the TautomerEnumerator
352 */
353 TautomerEnumeratorCallback *getCallback() const { return d_callback.get(); }
354
355 //! returns a \c TautomerEnumeratorResult structure for the input molecule
356 /*!
357 The enumeration rules are inspired by the publication:
358 M. Sitzmann et al., “Tautomerism in Large Databases.”, JCAMD 24:521 (2010)
359 https://doi.org/10.1007/s10822-010-9346-4
360
361 \param mol: the molecule to be enumerated
362
363 Note: the definitions used here are that the atoms modified during
364 tautomerization are the atoms at the beginning and end of each tautomer
365 transform (the H "donor" and H "acceptor" in the transform) and the bonds
366 modified during transformation are any bonds whose order is changed during
367 the tautomer transform (these are the bonds between the "donor" and the
368 "acceptor")
369
370 */
372
373 //! Deprecated, please use the form returning a \c TautomerEnumeratorResult
374 //! instead
375 [[deprecated(
376 "please use the form returning a TautomerEnumeratorResult "
377 "instead")]] std::vector<ROMOL_SPTR>
378 enumerate(const ROMol &mol, boost::dynamic_bitset<> *modifiedAtoms,
379 boost::dynamic_bitset<> *modifiedBonds = nullptr) const;
380
381 //! returns the canonical tautomer from a \c TautomerEnumeratorResult
383 boost::function<int(const ROMol &mol)> scoreFunc =
384 TautomerScoringFunctions::scoreTautomer) const;
385
386 //! returns the canonical tautomer from an iterable of possible tautomers
387 /// When Iterable is TautomerEnumeratorResult we use the other non-templated
388 /// overload for efficiency (TautomerEnumeratorResult already has SMILES so no
389 /// need to recompute them)
390 template <class Iterable,
391 typename std::enable_if<
392 !std::is_same<Iterable, TautomerEnumeratorResult>::value,
393 int>::type = 0>
394 ROMol *pickCanonical(const Iterable &tautomers,
395 boost::function<int(const ROMol &mol)> scoreFunc =
396 TautomerScoringFunctions::scoreTautomer) const {
397 ROMOL_SPTR bestMol;
398 if (tautomers.size() == 1) {
399 bestMol = *tautomers.begin();
400 } else {
401 // Calculate score for each tautomer
402 int bestScore = std::numeric_limits<int>::min();
403 std::string bestSmiles = "";
404 for (const auto &t : tautomers) {
405 auto score = scoreFunc(*t);
406#ifdef VERBOSE_ENUMERATION
407 std::cerr << " " << MolToSmiles(*t) << " " << score << std::endl;
408#endif
409 if (score > bestScore) {
410 bestScore = score;
411 bestSmiles = MolToSmiles(*t);
412 bestMol = t;
413 } else if (score == bestScore) {
414 auto smiles = MolToSmiles(*t);
415 if (smiles < bestSmiles) {
416 bestSmiles = smiles;
417 bestMol = t;
418 }
419 }
420 }
421 }
422 ROMol *res = new ROMol(*bestMol);
423 static const bool cleanIt = true;
424 static const bool force = true;
425 MolOps::assignStereochemistry(*res, cleanIt, force);
426
427 return res;
428 }
429
430 //! returns the canonical tautomer for a molecule
431 /*!
432 Note that the canonical tautomer is very likely not the most stable tautomer
433 for any given conditions. The default scoring rules are designed to produce
434 "reasonable" tautomers, but the primary concern is that the results are
435 canonical: you always get the same canonical tautomer for a molecule
436 regardless of what the input tautomer or atom ordering were.
437
438 The default scoring scheme is inspired by the publication:
439 M. Sitzmann et al., “Tautomerism in Large Databases.”, JCAMD 24:521 (2010)
440 https://doi.org/10.1007/s10822-010-9346-4
441
442 */
444 boost::function<int(const ROMol &mol)> scoreFunc =
445 TautomerScoringFunctions::scoreTautomer) const;
447 boost::function<int(const ROMol &mol)> scoreFunc =
448 TautomerScoringFunctions::scoreTautomer) const;
449
450 private:
451 bool setTautomerStereoAndIsoHs(const ROMol &mol, ROMol &taut,
452 const TautomerEnumeratorResult &res) const;
453 std::shared_ptr<TautomerCatalog> dp_catalog;
454 std::shared_ptr<TautomerEnumeratorCallback> d_callback;
455 unsigned int d_maxTautomers;
456 unsigned int d_maxTransforms;
457 bool d_removeSp3Stereo;
458 bool d_removeBondStereo;
459 bool d_removeIsotopicHs;
460 bool d_reassignStereo;
461}; // TautomerEnumerator class
462
463// caller owns the pointer
465 const CleanupParameters &params) {
466 return new TautomerEnumerator(params);
467}
468// caller owns the pointer
474
475} // namespace MolStandardize
476} // namespace RDKit
477
478#endif
#define PRECONDITION(expr, mess)
Definition Invariant.h:109
A Catalog with a hierarchical structure.
Definition Catalog.h:135
virtual bool operator()(const ROMol &, const TautomerEnumeratorResult &)=0
Contains results of tautomer enumeration.
const boost::dynamic_bitset & modifiedBonds() const
const boost::dynamic_bitset & modifiedAtoms() const
TautomerEnumeratorResult(const TautomerEnumeratorResult &other)
TautomerEnumerator(const CleanupParameters &params=CleanupParameters())
std::vector< ROMOL_SPTR > enumerate(const ROMol &mol, boost::dynamic_bitset<> *modifiedAtoms, boost::dynamic_bitset<> *modifiedBonds=nullptr) const
ROMol * pickCanonical(const TautomerEnumeratorResult &tautRes, boost::function< int(const ROMol &mol)> scoreFunc=TautomerScoringFunctions::scoreTautomer) const
returns the canonical tautomer from a TautomerEnumeratorResult
TautomerEnumeratorCallback * getCallback() const
ROMol * canonicalize(const ROMol &mol, boost::function< int(const ROMol &mol)> scoreFunc=TautomerScoringFunctions::scoreTautomer) const
returns the canonical tautomer for a molecule
void canonicalizeInPlace(RWMol &mol, boost::function< int(const ROMol &mol)> scoreFunc=TautomerScoringFunctions::scoreTautomer) const
TautomerEnumeratorResult enumerate(const ROMol &mol) const
returns a TautomerEnumeratorResult structure for the input molecule
TautomerEnumerator & operator=(const TautomerEnumerator &other)
void setCallback(TautomerEnumeratorCallback *callback)
TautomerEnumerator(const TautomerEnumerator &other)
void setMaxTransforms(unsigned int maxTransforms)
void setMaxTautomers(unsigned int maxTautomers)
ROMol * pickCanonical(const Iterable &tautomers, boost::function< int(const ROMol &mol)> scoreFunc=TautomerScoringFunctions::scoreTautomer) const
Tautomer(ROMOL_SPTR t, ROMOL_SPTR k, size_t a=0, size_t b=0)
RWMol is a molecule class that is intended to be edited.
Definition RWMol.h:32
#define RDKIT_MOLSTANDARDIZE_EXPORT
Definition export.h:353
RDKIT_MOLSTANDARDIZE_EXPORT int scoreHeteroHs(const ROMol &mol)
RDKIT_MOLSTANDARDIZE_EXPORT const std::vector< SubstructTerm > & getDefaultTautomerScoreSubstructs()
RDKIT_MOLSTANDARDIZE_EXPORT int scoreRings(const ROMol &mol)
RDKIT_MOLSTANDARDIZE_EXPORT int scoreSubstructs(const ROMol &mol, const std::vector< SubstructTerm > &terms=getDefaultTautomerScoreSubstructs())
scoreSubstructs scores the molecule based on the substructure definitions
RDKIT_MOLSTANDARDIZE_EXPORT const TautomerTransformDefs defaultTautomerTransformsv1
std::map< std::string, Tautomer > SmilesTautomerMap
TautomerEnumerator * tautomerEnumeratorFromParams(const CleanupParameters &params)
TautomerEnumerator * getV1TautomerEnumerator()
RDCatalog::HierarchCatalog< TautomerCatalogEntry, TautomerCatalogParams, int > TautomerCatalog
std::pair< std::string, Tautomer > SmilesTautomerPair
Std stuff.
bool rdvalue_is(const RDValue_cast_t)
boost::shared_ptr< ROMol > ROMOL_SPTR
SubstructTerm(std::string aname, std::string asmarts, int ascore)