RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
FileParsers.h
Go to the documentation of this file.
1//
2// Copyright (C) 2002-2024 Greg Landrum and other RDKit contributors
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_FILEPARSERS_H
12#define RD_FILEPARSERS_H
13
14#include <RDGeneral/types.h>
15#include <GraphMol/RDKitBase.h>
17#include "CDXMLParser.h"
18#include <string>
19#include <string_view>
20#include <iostream>
21#include <vector>
22#include <exception>
23
24#include <boost/shared_ptr.hpp>
25
26namespace RDKit {
27
28RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig);
29
30namespace v2 {
31namespace FileParsers {
//! Exception type that carries a caller-supplied error message.
/*!
  Judging by its name it is used to report mol file features that the parser
  does not handle; the throwing sites are not part of this header.
*/
class RDKIT_FILEPARSERS_EXPORT MolFileUnhandledFeatureException
    : public std::exception {
 public:
  //! construct with an error message (\c msg must be a non-null C string)
  explicit MolFileUnhandledFeatureException(const char *msg) : _msg(msg) {}
  //! construct with an error message
  // Taken by const reference: the former `const std::string msg` by-value
  // parameter forced an extra copy that could never be moved from.
  explicit MolFileUnhandledFeatureException(const std::string &msg)
      : _msg(msg) {}
  //! get the error message; the pointer stays valid while this object lives
  const char *what() const noexcept override { return _msg.c_str(); }
  ~MolFileUnhandledFeatureException() noexcept override = default;

 private:
  std::string _msg;  //!< owned copy of the message passed to the constructor
};
47
49 bool sanitize = true; /**< sanitize the molecule after building it */
50 bool removeHs = true; /**< remove Hs after constructing the molecule */
51 bool strictParsing = true; /**< if set to false, the parser is more lax about
52 correctness of the contents. */
53 bool expandAttachmentPoints =
54 false; /**< toggle conversion of attachment points into dummy atoms */
55};
57 std::istream &inStream, unsigned int &line,
60 const std::string &molBlock,
63 const std::string &fName,
65
66} // namespace FileParsers
67} // namespace v2
68
69inline namespace v1 {
71//-----
72// mol files
73//-----
74// \brief construct a molecule from MDL mol data in a stream
75/*!
76 * \param inStream - stream containing the data
77 * \param line - current line number (used for error reporting)
78 * \param sanitize - toggles sanitization and stereochemistry
79 * perception of the molecule
80 * \param removeHs - toggles removal of Hs from the molecule. H removal
81 * is only done if the molecule is sanitized
82 * \param line - current line number (used for error reporting)
83 * \param strictParsing - if set to false, the parser is more lax about
84 * correctness of the contents.
85 *
86 */
87inline RWMol *MolDataStreamToMol(std::istream *inStream, unsigned int &line,
88 bool sanitize = true, bool removeHs = true,
89 bool strictParsing = true) {
91 ps.sanitize = sanitize;
92 ps.removeHs = removeHs;
93 ps.strictParsing = strictParsing;
94 return v2::FileParsers::MolFromMolDataStream(*inStream, line, ps).release();
95};
96// \overload
97inline RWMol *MolDataStreamToMol(std::istream &inStream, unsigned int &line,
98 bool sanitize = true, bool removeHs = true,
99 bool strictParsing = true) {
100 return MolDataStreamToMol(&inStream, line, sanitize, removeHs, strictParsing);
101};
102// \brief construct a molecule from an MDL mol block
103/*!
104 * \param molBlock - string containing the mol block
105 * \param sanitize - toggles sanitization and stereochemistry
106 * perception of the molecule
107 * \param removeHs - toggles removal of Hs from the molecule. H removal
108 * is only done if the molecule is sanitized
109 * \param strictParsing - if set to false, the parser is more lax about
110 * correctness of the contents.
111 */
112inline RWMol *MolBlockToMol(const std::string &molBlock, bool sanitize = true,
113 bool removeHs = true, bool strictParsing = true) {
115 ps.sanitize = sanitize;
116 ps.removeHs = removeHs;
117 ps.strictParsing = strictParsing;
118 return v2::FileParsers::MolFromMolBlock(molBlock, ps).release();
119};
120
121// \brief construct a molecule from an MDL mol file
122/*!
123 * \param fName - string containing the file name
124 * \param sanitize - toggles sanitization and stereochemistry
125 * perception of the molecule
126 * \param removeHs - toggles removal of Hs from the molecule. H removal
127 * is only done if the molecule is sanitized
128 * \param strictParsing - if set to false, the parser is more lax about
129 * correctness of the contents.
130 */
131inline RWMol *MolFileToMol(const std::string &fName, bool sanitize = true,
132 bool removeHs = true, bool strictParsing = true) {
134 ps.sanitize = sanitize;
135 ps.removeHs = removeHs;
136 ps.strictParsing = strictParsing;
137 return v2::FileParsers::MolFromMolFile(fName, ps).release();
138};
139} // namespace v1
140
141//-----
142// TPL handling:
143//-----
144
145namespace v2 {
146namespace FileParsers {
148 bool sanitize = true; /**< sanitize the molecule after building it */
149 bool skipFirstConf =
150 false; /**< if set to true, the first conformer will be skipped */
151};
153 std::istream &inStream, unsigned int &line,
154 const TPLParserParams &params = TPLParserParams());
156 const std::string &fName,
157 const TPLParserParams &params = TPLParserParams());
158
159} // namespace FileParsers
160} // namespace v2
161
162inline namespace v1 {
163//! \brief translate TPL data (BioCad format) into a multi-conf molecule
164/*!
165 \param inStream: the stream from which to read
166 \param line: used to track the line number of errors
167 \param sanitize: toggles sanitization and stereochemistry
168 perception of the molecule
169 \param skipFirstConf: according to the TPL format description, the atomic
170 coords in the atom-information block describe the first
171 conformation and the first conf block describes second
172 conformation. The CombiCode, on the other hand, writes
173 the first conformation data both to the atom-information
174 block and to the first conf block. We want to be able to
175 read CombiCode-style tpls, so we'll allow this
176 mis-feature
177 to be parsed when this flag is set.
178*/
179inline RWMol *TPLDataStreamToMol(std::istream *inStream, unsigned int &line,
180 bool sanitize = true,
181 bool skipFirstConf = false) {
183 ps.sanitize = sanitize;
184 ps.skipFirstConf = skipFirstConf;
185 return v2::FileParsers::MolFromTPLDataStream(*inStream, line, ps).release();
186}
187
188//! \brief construct a multi-conf molecule from a TPL (BioCad format) file
189/*!
190 \param fName: the name of the file from which to read
191 \param sanitize: toggles sanitization and stereochemistry
192 perception of the molecule
193 \param skipFirstConf: according to the TPL format description, the atomic
194 coords in the atom-information block describe the first
195 conformation and the first conf block describes second
196 conformation. The CombiCode, on the other hand, writes
197 the first conformation data both to the atom-information
198 block and to the first conf block. We want to be able to
199 read CombiCode-style tpls, so we'll allow this
200 mis-feature
201 to be parsed when this flag is set.
202*/
203inline RWMol *TPLFileToMol(const std::string &fName, bool sanitize = true,
204 bool skipFirstConf = false) {
206 ps.sanitize = sanitize;
207 ps.skipFirstConf = skipFirstConf;
208 return v2::FileParsers::MolFromTPLFile(fName, ps).release();
209}
210} // namespace v1
211
212namespace v2 {
213namespace FileParsers {
214
215//-----
216// MOL2 handling
217//-----
218
//! selects the atom type definitions used when reading Mol2 data
enum Mol2Type {
  CORINA = 0  //!< supports output from Corina and some dbtranslate output
};
222
224 bool sanitize = true; /**< sanitize the molecule after building it */
225 bool removeHs = true; /**< remove Hs after constructing the molecule */
226 Mol2Type variant = Mol2Type::CORINA; /**< the atom type definitions to use */
227 bool cleanupSubstructures =
228 true; /**< toggles recognition and cleanup of common substructures */
229};
230
232 std::istream &inStream,
233 const Mol2ParserParams &params = Mol2ParserParams());
235 const std::string &molBlock,
236 const Mol2ParserParams &params = Mol2ParserParams());
238 const std::string &fName,
239 const Mol2ParserParams &params = Mol2ParserParams());
240
241} // namespace FileParsers
242} // namespace v2
243
244inline namespace v1 {
246
247// \brief construct a molecule from a Tripos mol2 file
248/*!
249 *
250 * \param fName - string containing the file name
251 * \param sanitize - toggles sanitization of the molecule
252 * \param removeHs - toggles removal of Hs from the molecule. H removal
253 * is only done if the molecule is sanitized
254 * \param variant - the atom type definitions to use
255 * \param cleanupSubstructures - toggles recognition and cleanup of common
256 * substructures
257 */
258inline RWMol *Mol2FileToMol(const std::string &fName, bool sanitize = true,
259 bool removeHs = true,
260 Mol2Type variant = Mol2Type::CORINA,
261 bool cleanupSubstructures = true) {
263 ps.sanitize = sanitize;
264 ps.removeHs = removeHs;
265 ps.variant = variant;
266 ps.cleanupSubstructures = cleanupSubstructures;
267 return v2::FileParsers::MolFromMol2File(fName, ps).release();
268}
269
270// \brief construct a molecule from Tripos mol2 data in a stream
271/*!
272 * \param inStream - stream containing the data
273 * \param sanitize - toggles sanitization of the molecule
274 * \param removeHs - toggles removal of Hs from the molecule. H removal
275 * is only done if the molecule is sanitized
276 * \param variant - the atom type definitions to use
277 * \param cleanupSubstructures - toggles recognition and cleanup of common
278 * substructures
279 */
280inline RWMol *Mol2DataStreamToMol(std::istream &inStream, bool sanitize = true,
281 bool removeHs = true,
282 Mol2Type variant = Mol2Type::CORINA,
283 bool cleanupSubstructures = true) {
285 ps.sanitize = sanitize;
286 ps.removeHs = removeHs;
287 ps.variant = variant;
288 ps.cleanupSubstructures = cleanupSubstructures;
289 return v2::FileParsers::MolFromMol2DataStream(inStream, ps).release();
290}
291// \overload
292inline RWMol *Mol2DataStreamToMol(std::istream *inStream, bool sanitize = true,
293 bool removeHs = true,
294 Mol2Type variant = Mol2Type::CORINA,
295 bool cleanupSubstructures = true) {
296 return Mol2DataStreamToMol(*inStream, sanitize, removeHs, variant,
297 cleanupSubstructures);
298}
299
300// \brief construct a molecule from a Tripos mol2 block
301/*!
302 * \param molBlock - string containing the mol block
303 * \param sanitize - toggles sanitization of the molecule
304 * \param removeHs - toggles removal of Hs from the molecule. H removal
305 * is only done if the molecule is sanitized
306 * \param variant - the atom type definitions to use
307 * \param cleanupSubstructures - toggles recognition and cleanup of common
308 * substructures
309 */
310inline RWMol *Mol2BlockToMol(const std::string &molBlock, bool sanitize = true,
311 bool removeHs = true,
312 Mol2Type variant = Mol2Type::CORINA,
313 bool cleanupSubstructures = true) {
315 ps.sanitize = sanitize;
316 ps.removeHs = removeHs;
317 ps.variant = variant;
318 ps.cleanupSubstructures = cleanupSubstructures;
319 return v2::FileParsers::MolFromMol2Block(molBlock, ps).release();
320}
321} // namespace v1
322
323namespace v2 {
324namespace FileParsers {
325
327 std::istream &inStream);
328// \brief construct a molecule from an xyz block
329/*!
330 * \param xyzBlock - string containing the xyz block
331 */
333 const std::string &xyzBlock);
334// \brief construct a molecule from an xyz file
335/*!
336 * \param fName - string containing the file name
337 */
339 const std::string &fName);
340} // namespace FileParsers
341} // namespace v2
342inline namespace v1 {
343inline RWMol *XYZDataStreamToMol(std::istream &inStream) {
344 return v2::FileParsers::MolFromXYZDataStream(inStream).release();
345}
346// \brief construct a molecule from an xyz block
347/*!
348 * \param xyzBlock - string containing the xyz block
349 */
350inline RWMol *XYZBlockToMol(const std::string &xyzBlock) {
351 return v2::FileParsers::MolFromXYZBlock(xyzBlock).release();
352}
353// \brief construct a molecule from an xyz file
354/*!
355 * \param fName - string containing the file name
356 */
357inline RWMol *XYZFileToMol(const std::string &fName) {
358 return v2::FileParsers::MolFromXYZFile(fName).release();
359}
360
361} // namespace v1
362
363namespace v2 {
364namespace FileParsers {
366 bool sanitize = true; /**< sanitize the molecule after building it */
367 bool removeHs = true; /**< remove Hs after constructing the molecule */
368 bool proximityBonding = true; /**< if set to true, proximity bonding will be
369 performed */
370 unsigned int flavor = 0; /**< flavor to use */
371};
372
374 std::istream &inStream, const PDBParserParams &params = PDBParserParams());
376 const std::string &fname,
377 const PDBParserParams &params = PDBParserParams());
379 const std::string &str, const PDBParserParams &params = PDBParserParams());
380} // namespace FileParsers
381} // namespace v2
382
383inline namespace v1 {
385inline RWMol *PDBBlockToMol(const std::string &str, bool sanitize = true,
386 bool removeHs = true, unsigned int flavor = 0,
387 bool proximityBonding = true) {
389 ps.sanitize = sanitize;
390 ps.removeHs = removeHs;
391 ps.flavor = flavor;
392 ps.proximityBonding = proximityBonding;
393 return v2::FileParsers::MolFromPDBBlock(str, ps).release();
394}
395inline RWMol *PDBBlockToMol(const char *str, bool sanitize = true,
396 bool removeHs = true, unsigned int flavor = 0,
397 bool proximityBonding = true) {
398 return PDBBlockToMol(std::string(str), sanitize, removeHs, flavor,
399 proximityBonding);
400}
401inline RWMol *PDBFileToMol(const std::string &fname, bool sanitize = true,
402 bool removeHs = true, unsigned int flavor = 0,
403 bool proximityBonding = true) {
405 ps.sanitize = sanitize;
406 ps.removeHs = removeHs;
407 ps.flavor = flavor;
408 ps.proximityBonding = proximityBonding;
409 return v2::FileParsers::MolFromPDBFile(fname, ps).release();
410}
411inline RWMol *PDBDataStreamToMol(std::istream &inStream, bool sanitize = true,
412 bool removeHs = true, unsigned int flavor = 0,
413 bool proximityBonding = true) {
415 ps.sanitize = sanitize;
416 ps.removeHs = removeHs;
417 ps.flavor = flavor;
418 ps.proximityBonding = proximityBonding;
419 return v2::FileParsers::MolFromPDBDataStream(inStream, ps).release();
420}
421inline RWMol *PDBDataStreamToMol(std::istream *inStream, bool sanitize = true,
422 bool removeHs = true, unsigned int flavor = 0,
423 bool proximityBonding = true) {
424 return PDBDataStreamToMol(*inStream, sanitize, removeHs, flavor,
425 proximityBonding);
426}
427} // namespace v1
428
429// \brief reads a molecule from the metadata in an RDKit-generated SVG file
430/*!
431 * \param svg - string containing the SVG
432 * \param sanitize - toggles sanitization of the molecule
433 * \param removeHs - toggles removal of Hs from the molecule. H removal
434 * is only done if the molecule is sanitized
435 *
436 * **NOTE** This functionality should be considered beta.
437 */
439 bool sanitize = true,
440 bool removeHs = true);
441/*! \overload
442 */
444 bool sanitize = true,
445 bool removeHs = true);
446
447inline std::unique_ptr<RDKit::RWMol> operator"" _ctab(const char *text,
448 size_t len) {
449 std::string data(text, len);
450 try {
452 } catch (const RDKit::MolSanitizeException &) {
453 return nullptr;
454 }
455}
456inline std::unique_ptr<RDKit::RWMol> operator"" _mol2(const char *text,
457 size_t len) {
458 std::string data(text, len);
459 try {
461 } catch (const RDKit::MolSanitizeException &) {
462 return nullptr;
463 }
464}
465
466inline std::unique_ptr<RDKit::RWMol> operator"" _pdb(const char *text,
467 size_t len) {
468 std::string data(text, len);
469 try {
471 } catch (const RDKit::MolSanitizeException &) {
472 return nullptr;
473 }
474}
475
476} // namespace RDKit
477
478#endif
pulls in the core RDKit functionality
class for flagging sanitization errors
RWMol is a molecule class that is intended to be edited.
Definition RWMol.h:32
const char * what() const noexcept override
get the error message
Definition FileParsers.h:41
MolFileUnhandledFeatureException(const std::string msg)
construct with an error message
Definition FileParsers.h:38
~MolFileUnhandledFeatureException() noexcept override=default
MolFileUnhandledFeatureException(const char *msg)
construct with an error message
Definition FileParsers.h:36
#define RDKIT_FILEPARSERS_EXPORT
Definition export.h:161
RWMol * Mol2BlockToMol(const std::string &molBlock, bool sanitize=true, bool removeHs=true, Mol2Type variant=Mol2Type::CORINA, bool cleanupSubstructures=true)
RWMol * XYZFileToMol(const std::string &fName)
RWMol * Mol2FileToMol(const std::string &fName, bool sanitize=true, bool removeHs=true, Mol2Type variant=Mol2Type::CORINA, bool cleanupSubstructures=true)
RWMol * MolFileToMol(const std::string &fName, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
RWMol * Mol2DataStreamToMol(std::istream &inStream, bool sanitize=true, bool removeHs=true, Mol2Type variant=Mol2Type::CORINA, bool cleanupSubstructures=true)
RWMol * MolBlockToMol(const std::string &molBlock, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
RWMol * PDBDataStreamToMol(std::istream &inStream, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
RWMol * MolDataStreamToMol(std::istream *inStream, unsigned int &line, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
Definition FileParsers.h:87
RWMol * TPLFileToMol(const std::string &fName, bool sanitize=true, bool skipFirstConf=false)
construct a multi-conf molecule from a TPL (BioCad format) file
RWMol * PDBFileToMol(const std::string &fname, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
RWMol * XYZDataStreamToMol(std::istream &inStream)
RWMol * TPLDataStreamToMol(std::istream *inStream, unsigned int &line, bool sanitize=true, bool skipFirstConf=false)
translate TPL data (BioCad format) into a multi-conf molecule
RWMol * XYZBlockToMol(const std::string &xyzBlock)
RWMol * PDBBlockToMol(const std::string &str, bool sanitize=true, bool removeHs=true, unsigned int flavor=0, bool proximityBonding=true)
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromTPLFile(const std::string &fName, const TPLParserParams &params=TPLParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromPDBFile(const std::string &fname, const PDBParserParams &params=PDBParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromPDBDataStream(std::istream &inStream, const PDBParserParams &params=PDBParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromXYZFile(const std::string &fName)
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromXYZBlock(const std::string &xyzBlock)
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromMolFile(const std::string &fName, const MolFileParserParams &params=MolFileParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromMol2DataStream(std::istream &inStream, const Mol2ParserParams &params=Mol2ParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromPDBBlock(const std::string &str, const PDBParserParams &params=PDBParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromMol2File(const std::string &fName, const Mol2ParserParams &params=Mol2ParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromXYZDataStream(std::istream &inStream)
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromMolBlock(const std::string &molBlock, const MolFileParserParams &params=MolFileParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromMol2Block(const std::string &molBlock, const Mol2ParserParams &params=Mol2ParserParams())
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromMolDataStream(std::istream &inStream, unsigned int &line, const MolFileParserParams &params=MolFileParserParams())
@ CORINA
supports output from Corina and some dbtranslate output
RDKIT_FILEPARSERS_EXPORT std::unique_ptr< RWMol > MolFromTPLDataStream(std::istream &inStream, unsigned int &line, const TPLParserParams &params=TPLParserParams())
Std stuff.
RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig)
bool rdvalue_is(const RDValue_cast_t)
RDKIT_FILEPARSERS_EXPORT RWMol * RDKitSVGToMol(const std::string &svg, bool sanitize=true, bool removeHs=true)