RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
SmilesWrite.h
Go to the documentation of this file.
1//
2// Copyright (C) 2002-2021 Greg Landrum and other RDKit contributors
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_SMILESWRITE_H_012020
12#define RD_SMILESWRITE_H_012020
13
14#include <string>
15#include <vector>
16#include <memory>
17#include <cstdint>
18#include <limits>
19
20#include <boost/shared_ptr.hpp>
21
22namespace RDKit {
23class Atom;
24class Bond;
25class ROMol;
26
27typedef std::vector<boost::shared_ptr<ROMol>> MOL_SPTR_VECT;
28
30 bool doIsomericSmiles =
31 true; /**< include stereochemistry and isotope information */
32 bool doKekule = false; /**< kekulize the molecule before generating the SMILES
33 and output single/double bonds. NOTE that the output
34 is not canonical and that this will throw an
35 exception if the molecule cannot be kekulized. */
36 bool canonical = true; /**< generate canonical SMILES */
37 bool cleanStereo = true; /**< clean up stereo */
38 bool allBondsExplicit = false; /**< include symbols for all bonds */
39 bool allHsExplicit = false; /**< provide hydrogen counts for every atom */
40 bool doRandom = false; /**< randomize the output order. The resulting SMILES
41 is not canonical and the value of the canonical
42 parameter will be ignored. */
43 int rootedAtAtom = -1; /**< make sure the SMILES starts at the specified
44 atom. The resulting SMILES is not canonical and
45 the value of the canonical parameter will be
46 ignored. */
47 bool includeDativeBonds =
48 true; /**< include the RDKit extension for dative bonds. Otherwise dative
49 bonds will be written as single bonds*/
50 bool ignoreAtomMapNumbers = false; /**< If true, ignores any atom map numbers
51 when canonicalizing the molecule */
52};
53
54namespace SmilesWrite {
55
56#define CXSMILESFIELDS_ENUM_ITEMS \
57 CXSMILESFIELDS_ENUM_ITEM(CX_NONE, 0) \
58 CXSMILESFIELDS_ENUM_ITEM(CX_ATOM_LABELS, 1 << 0) \
59 CXSMILESFIELDS_ENUM_ITEM(CX_MOLFILE_VALUES, 1 << 1) \
60 CXSMILESFIELDS_ENUM_ITEM(CX_COORDS, 1 << 2) \
61 CXSMILESFIELDS_ENUM_ITEM(CX_RADICALS, 1 << 3) \
62 CXSMILESFIELDS_ENUM_ITEM(CX_ATOM_PROPS, 1 << 4) \
63 CXSMILESFIELDS_ENUM_ITEM(CX_LINKNODES, 1 << 5) \
64 CXSMILESFIELDS_ENUM_ITEM(CX_ENHANCEDSTEREO, 1 << 6) \
65 CXSMILESFIELDS_ENUM_ITEM(CX_SGROUPS, 1 << 7) \
66 CXSMILESFIELDS_ENUM_ITEM(CX_POLYMER, 1 << 8) \
67 CXSMILESFIELDS_ENUM_ITEM(CX_BOND_CFG, 1 << 9) \
68 CXSMILESFIELDS_ENUM_ITEM(CX_BOND_ATROPISOMER, 1 << 10) \
69 CXSMILESFIELDS_ENUM_ITEM(CX_COORDINATE_BONDS, 1 << 11) \
70 CXSMILESFIELDS_ENUM_ITEM(CX_ALL, 0x7fffffff) \
71 CXSMILESFIELDS_ENUM_ITEM(CX_ALL_BUT_COORDS, CX_ALL ^ CX_COORDS)
72
73#define CXSMILESFIELDS_ENUM_ITEM(k, v) k = (v),
75#undef CXSMILESFIELDS_ENUM_ITEM
76#define CXSMILESFIELDS_STD_MAP_ITEM(k) {#k, SmilesWrite::CXSmilesFields::k},
77#define CXSMILESFIELDS_ENUM_ITEM(k, v) CXSMILESFIELDS_STD_MAP_ITEM(k)
78#define CXSMILESFIELDS_ITEMS_MAP \
79 std::map<std::string, SmilesWrite::CXSmilesFields> { \
80 CXSMILESFIELDS_ENUM_ITEMS \
81 }
82
83//! \brief returns the cxsmiles data for a molecule
85 const ROMol &mol, std::uint32_t flags = CXSmilesFields::CX_ALL);
86
87//! \brief returns the cxsmiles data for a vector of molecules
89 const std::vector<ROMol *> &mols, std::uint32_t flags);
90
91//! \brief returns true if the atomic number is in the SMILES organic subset
93
94//! \brief returns the SMILES for an atom
95/*!
96 \param atom : the atom to work with
97 \param ps : the parameters controlling the SMILES generation
98*/
100 const SmilesWriteParams &ps);
101
102//! \brief returns the SMILES for an atom
103/*!
104 \param atom : the atom to work with
105 \param doKekule : we're doing kekulized smiles (e.g. don't use
106 lower case for the atom label)
107 \param bondIn : the bond we came into the atom on (unused)
108 \param allHsExplicit : if true, hydrogen counts will be provided for every
109 atom.
110 \param isomericSmiles : if true, isomeric SMILES will be generated
111*/
112inline std::string GetAtomSmiles(const Atom *atom, bool doKekule = false,
113 const Bond * = nullptr,
114 bool allHsExplicit = false,
115 bool isomericSmiles = true) {
116 // RDUNUSED_PARAM(bondIn);
119 ps.doKekule = doKekule;
120 ps.allHsExplicit = allHsExplicit;
121 return GetAtomSmiles(atom, ps);
122};
123
124//! \brief returns the SMILES for a bond
125/*!
126 \param bond : the bond to work with
127 \param ps : the parameters controlling the SMILES generation
128 \param atomToLeftIdx : the index of the atom preceding \c bond
129 in the SMILES
130*/
132 const SmilesWriteParams &ps,
133 int atomToLeftIdx = -1);
134//! \brief returns the SMILES for a bond
135/*!
136 \param bond : the bond to work with
137 \param atomToLeftIdx : the index of the atom preceding \c bond
138 in the SMILES
139 \param doKekule : we're doing kekulized smiles (e.g. write out
140 bond orders for aromatic bonds)
141 \param allBondsExplicit : if true, symbols will be included for all bonds.
142*/
143inline std::string GetBondSmiles(const Bond *bond, int atomToLeftIdx = -1,
144 bool doKekule = false,
145 bool allBondsExplicit = false) {
147 ps.doKekule = doKekule;
148 ps.allBondsExplicit = allBondsExplicit;
149 ps.doIsomericSmiles = false;
150 return GetBondSmiles(bond, ps, atomToLeftIdx);
151};
152
153namespace detail {
155 const ROMol &mol, const SmilesWriteParams &params, bool doingCXSmiles);
156}
157
158} // namespace SmilesWrite
159
160//! \brief returns canonical SMILES for a molecule
162 const ROMol &mol, const SmilesWriteParams &params);
163
164//! \brief returns SMILES for a molecule, canonical by default
165/*!
166 \param mol : the molecule in question.
167 \param doIsomericSmiles : include stereochemistry and isotope information
168 in the SMILES
169
170 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds) NOTE that
171 this will throw an exception if the molecule cannot be kekulized.
172
173 \param rootedAtAtom : make sure the SMILES starts at the specified atom.
174 The resulting SMILES is not, of course, canonical.
175 \param canonical : if false, no attempt will be made to canonicalize the
176 SMILES
177 \param allBondsExplicit : if true, symbols will be included for all bonds.
178 \param allHsExplicit : if true, hydrogen counts will be provided for every
179 atom.
180 \param doRandom : if true, the first atom in the SMILES string will be
181 selected at random and the SMILES string will not be canonical
182 \param ignoreAtomMapNumbers : if true, ignores any atom map numbers when
183 canonicalizing the molecule
184 */
185inline std::string MolToSmiles(const ROMol &mol, bool doIsomericSmiles = true,
186 bool doKekule = false, int rootedAtAtom = -1,
187 bool canonical = true,
188 bool allBondsExplicit = false,
189 bool allHsExplicit = false,
190 bool doRandom = false,
191 bool ignoreAtomMapNumbers = false) {
193 ps.doIsomericSmiles = doIsomericSmiles;
194 ps.doKekule = doKekule;
195 ps.rootedAtAtom = rootedAtAtom;
196 ps.canonical = canonical;
197 ps.allBondsExplicit = allBondsExplicit;
198 ps.allHsExplicit = allHsExplicit;
199 ps.doRandom = doRandom;
200 ps.ignoreAtomMapNumbers = ignoreAtomMapNumbers;
201 return MolToSmiles(mol, ps);
202};
203
204//! \brief returns a vector of random SMILES for a molecule (may contain
205//! duplicates)
206/*!
207 \param mol : the molecule in question.
208 \param numSmiles : the number of SMILES to return
209 \param randomSeed : if >0, will be used to seed the random number generator
210 \param doIsomericSmiles : include stereochemistry and isotope information
211 in the SMILES
212 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
213 \param allBondsExplicit : if true, symbols will be included for all bonds.
214 \param allHsExplicit : if true, hydrogen counts will be provided for every
215 atom.
216 */
218 const ROMol &mol, unsigned int numSmiles, unsigned int randomSeed = 0,
219 bool doIsomericSmiles = true, bool doKekule = false,
220 bool allBondsExplicit = false, bool allHsExplicit = false);
221
222//! \brief returns canonical SMILES for part of a molecule
224 const ROMol &mol, const SmilesWriteParams &params,
225 const std::vector<int> &atomsToUse,
226 const std::vector<int> *bondsToUse = nullptr,
227 const std::vector<std::string> *atomSymbols = nullptr,
228 const std::vector<std::string> *bondSymbols = nullptr);
229
230//! \brief returns canonical SMILES for part of a molecule
231/*!
232 \param mol : the molecule in question.
233 \param atomsToUse : indices of the atoms in the fragment
234 \param bondsToUse : indices of the bonds in the fragment. If this is not
235 provided,
236 all bonds between the atoms in atomsToUse will be included
237 \param atomSymbols : symbols to use for the atoms in the output SMILES
238 \param bondSymbols : symbols to use for the bonds in the output SMILES
239 \param doIsomericSmiles : include stereochemistry and isotope information
240 in the SMILES
241 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
242 \param rootedAtAtom : make sure the SMILES starts at the specified atom.
243 The resulting SMILES is not, of course, canonical.
244 \param canonical : if false, no attempt will be made to canonicalize the
245 SMILES
246 \param allBondsExplicit : if true, symbols will be included for all bonds.
247 \param allHsExplicit : if true, hydrogen counts will be provided for every
248 atom.
249 \param doRandom : generate a randomized smiles string by randomly choosing
250 the priority to follow in the DFS traversal. [default false]
251
252 \b NOTE: the bondSymbols are *not* currently used in the canonicalization.
253
254 */
255inline std::string MolFragmentToSmiles(
256 const ROMol &mol, const std::vector<int> &atomsToUse,
257 const std::vector<int> *bondsToUse = nullptr,
258 const std::vector<std::string> *atomSymbols = nullptr,
259 const std::vector<std::string> *bondSymbols = nullptr,
260 bool doIsomericSmiles = true, bool doKekule = false, int rootedAtAtom = -1,
261 bool canonical = true, bool allBondsExplicit = false,
262 bool allHsExplicit = false) {
264 ps.doIsomericSmiles = doIsomericSmiles;
265 ps.doKekule = doKekule;
266 ps.rootedAtAtom = rootedAtAtom;
267 ps.canonical = canonical;
268 ps.allBondsExplicit = allBondsExplicit;
269 ps.allHsExplicit = allHsExplicit;
270 return MolFragmentToSmiles(mol, ps, atomsToUse, bondsToUse, atomSymbols,
272}
273
274#define RESTOREBONDDIROPTION_ENUM_ITEMS \
275 RESTOREBONDDIROPTION_ENUM_ITEM(RestoreBondDirOptionTrue, \
276 0) /*!< DO restore bond dirs */ \
277 RESTOREBONDDIROPTION_ENUM_ITEM(RestoreBondDirOptionClear, \
278 1) /*!< clear all bond dir information */
279
280#define RESTOREBONDDIROPTION_ENUM_ITEM(k, v) k = v,
282#undef RESTOREBONDDIROPTION_ENUM_ITEM
283#define RESTOREBONDDIROPTION_STD_MAP_ITEM(k) {#k, k},
284#define RESTOREBONDDIROPTION_ENUM_ITEM(k, v) \
285 RESTOREBONDDIROPTION_STD_MAP_ITEM(k)
286#define RESTOREBONDDIROPTION_ITEMS_MAP \
287 std::map<std::string, RestoreBondDirOption> { \
288 RESTOREBONDDIROPTION_ENUM_ITEMS \
289 }
290
291//! \brief returns canonical CXSMILES for a molecule
293 const ROMol &mol, const SmilesWriteParams &ps,
294 std::uint32_t flags = SmilesWrite::CXSmilesFields::CX_ALL,
296
297//! \brief returns canonical CXSMILES for a molecule
298/*!
299 \param mol : the molecule in question.
300 \param doIsomericSmiles : include stereochemistry and isotope information
301 in the SMILES
302 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
303 \param rootedAtAtom : make sure the SMILES starts at the specified atom.
304 The resulting SMILES is not, of course, canonical.
305 \param canonical : if false, no attempt will be made to canonicalize the
306 SMILES
307 \param allBondsExplicit : if true, symbols will be included for all bonds.
308 \param allHsExplicit : if true, hydrogen counts will be provided for every
309 atom.
310 \param doRandom : generate a randomized smiles string by randomly choosing
311 the priority to follow in the DFS traversal. [default false]
312 */
313inline std::string MolToCXSmiles(const ROMol &mol, bool doIsomericSmiles = true,
314 bool doKekule = false, int rootedAtAtom = -1,
315 bool canonical = true,
316 bool allBondsExplicit = false,
317 bool allHsExplicit = false,
318 bool doRandom = false) {
320 ps.doIsomericSmiles = doIsomericSmiles;
321 ps.doKekule = doKekule;
322 ps.rootedAtAtom = rootedAtAtom;
323 ps.canonical = canonical;
324 ps.allBondsExplicit = allBondsExplicit;
325 ps.allHsExplicit = allHsExplicit;
326 ps.doRandom = doRandom;
327 return MolToCXSmiles(mol, ps, SmilesWrite::CXSmilesFields::CX_ALL);
328};
329
330//! \brief returns canonical CXSMILES for part of a molecule
332 const ROMol &mol, const SmilesWriteParams &params,
333 const std::vector<int> &atomsToUse,
334 const std::vector<int> *bondsToUse = nullptr,
335 const std::vector<std::string> *atomSymbols = nullptr,
336 const std::vector<std::string> *bondSymbols = nullptr);
337
338//! \brief returns canonical CXSMILES for part of a molecule
339/*!
340 \param mol : the molecule in question.
341 \param atomsToUse : indices of the atoms in the fragment
342 \param bondsToUse : indices of the bonds in the fragment. If this is not
343 provided,
344 all bonds between the atoms in atomsToUse will be included
345 \param atomSymbols : symbols to use for the atoms in the output SMILES
346 \param bondSymbols : symbols to use for the bonds in the output SMILES
347 \param doIsomericSmiles : include stereochemistry and isotope information
348 in the SMILES
349 \param doKekule : do Kekule smiles (i.e. don't use aromatic bonds)
350 \param rootedAtAtom : make sure the SMILES starts at the specified atom.
351 The resulting SMILES is not, of course, canonical.
352 \param canonical : if false, no attempt will be made to canonicalize the
353 SMILES
354 \param allBondsExplicit : if true, symbols will be included for all bonds.
355 \param allHsExplicit : if true, hydrogen counts will be provided for every
356 atom.
357
358 \b NOTE: the bondSymbols are *not* currently used in the canonicalization.
359
360 */
361inline std::string MolFragmentToCXSmiles(
362 const ROMol &mol, const std::vector<int> &atomsToUse,
363 const std::vector<int> *bondsToUse = nullptr,
364 const std::vector<std::string> *atomSymbols = nullptr,
365 const std::vector<std::string> *bondSymbols = nullptr,
366 bool doIsomericSmiles = true, bool doKekule = false, int rootedAtAtom = -1,
367 bool canonical = true, bool allBondsExplicit = false,
368 bool allHsExplicit = false) {
370 ps.doIsomericSmiles = doIsomericSmiles;
371 ps.doKekule = doKekule;
372 ps.rootedAtAtom = rootedAtAtom;
373 ps.canonical = canonical;
374 ps.allBondsExplicit = allBondsExplicit;
375 ps.allHsExplicit = allHsExplicit;
376 return MolFragmentToCXSmiles(mol, ps, atomsToUse, bondsToUse, atomSymbols,
378}
379
381 const std::string &details_json);
383 const char *details_json);
386 const std::string &details_json);
389 const char *details_json);
390
391} // namespace RDKit
392#endif
The class for representing atoms.
Definition Atom.h:75
class for representing a bond
Definition Bond.h:47
#define RDKIT_SMILESPARSE_EXPORT
Definition export.h:505
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params, bool doingCXSmiles)
RDKIT_SMILESPARSE_EXPORT std::string GetAtomSmiles(const Atom *atom, const SmilesWriteParams &ps)
returns the SMILES for an atom
RDKIT_SMILESPARSE_EXPORT bool inOrganicSubset(int atomicNumber)
returns true if the atomic number is in the SMILES organic subset
RDKIT_SMILESPARSE_EXPORT std::string GetBondSmiles(const Bond *bond, const SmilesWriteParams &ps, int atomToLeftIdx=-1)
returns the SMILES for a bond
RDKIT_SMILESPARSE_EXPORT std::string getCXExtensions(const ROMol &mol, std::uint32_t flags=CXSmilesFields::CX_ALL)
returns the cxsmiles data for a molecule
Std stuff.
RDKIT_SMILESPARSE_EXPORT std::vector< std::string > MolToRandomSmilesVect(const ROMol &mol, unsigned int numSmiles, unsigned int randomSeed=0, bool doIsomericSmiles=true, bool doKekule=false, bool allBondsExplicit=false, bool allHsExplicit=false)
returns a vector of random SMILES for a molecule (may contain duplicates)
void updateSmilesWriteParamsFromJSON(SmilesWriteParams &params, const std::string &details_json)
bool rdvalue_is(const RDValue_cast_t)
RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToSmiles(const ROMol &mol, const SmilesWriteParams &params, const std::vector< int > &atomsToUse, const std::vector< int > *bondsToUse=nullptr, const std::vector< std::string > *atomSymbols=nullptr, const std::vector< std::string > *bondSymbols=nullptr)
returns canonical SMILES for part of a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolToSmiles(const ROMol &mol, const SmilesWriteParams &params)
returns canonical SMILES for a molecule
RDKIT_SMILESPARSE_EXPORT std::string MolToCXSmiles(const ROMol &mol, const SmilesWriteParams &ps, std::uint32_t flags=SmilesWrite::CXSmilesFields::CX_ALL, RestoreBondDirOption restoreBondDirs=RestoreBondDirOptionClear)
returns canonical CXSMILES for a molecule
void updateCXSmilesFieldsFromJSON(SmilesWrite::CXSmilesFields &cxSmilesFields, RestoreBondDirOption &restoreBondDirs, const std::string &details_json)
RDKIT_SMILESPARSE_EXPORT std::string MolFragmentToCXSmiles(const ROMol &mol, const SmilesWriteParams &params, const std::vector< int > &atomsToUse, const std::vector< int > *bondsToUse=nullptr, const std::vector< std::string > *atomSymbols=nullptr, const std::vector< std::string > *bondSymbols=nullptr)
returns canonical CXSMILES for part of a molecule
std::vector< boost::shared_ptr< ROMol > > MOL_SPTR_VECT
RestoreBondDirOption
@ RESTOREBONDDIROPTION_ENUM_ITEMS