RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
MolSupplier.h
Go to the documentation of this file.
1//
2// Copyright (C) 2002-2024 greg landrum and other RDKit contributors
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_MOLSUPPLIER_H
12#define RD_MOLSUPPLIER_H
13
14#include <RDGeneral/types.h>
15
16#include <string>
17#include <string_view>
18#include <list>
19#include <memory>
20#include <vector>
21#include <iostream>
22#include <fstream>
23#include <GraphMol/ROMol.h>
25#include "FileParsers.h"
27
28#ifdef RDK_BUILD_MAEPARSER_SUPPORT
29namespace schrodinger {
30namespace mae {
31class Reader;
32class Block;
33} // namespace mae
34} // namespace schrodinger
35#endif // RDK_BUILD_MAEPARSER_SUPPORT
36
37namespace RDKit {
38RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig);
39
40namespace v2 {
41namespace FileParsers {
42/*!
43//
44// Here are a couple of ways one can interact with MolSuppliers:
45//
46// 1) Lazy (ForwardIterator):
47// while(!supplier.atEnd()){
48 // std::unique_ptr<RWMol> mol = supplier.next();
49// if(mol){
50// do something;
51// }
52// }
53// 2) Random Access:
54 // for(unsigned int i=0;i<supplier.length();i++){
55 // std::unique_ptr<RWMol> mol = supplier[i];
56// if(mol){
57// do something;
58// }
59// }
60//
61//
62*/
64 // this is an abstract base class to supply molecules one at a time
65 public:
67 virtual ~MolSupplier() {}  // virtual: suppliers may be destroyed through a base pointer; the body is empty, so this destructor itself does not call close()
68 virtual void init() = 0;  // pure virtual; each concrete supplier provides its own
69 virtual void reset() = 0;  // pure virtual; each concrete supplier provides its own
70 virtual bool atEnd() = 0;  // pure virtual; used as the loop-termination test in the "Lazy" usage example above
71 virtual std::unique_ptr<RWMol> next() = 0;  // pure virtual; the returned unique_ptr gives the caller ownership of the molecule (the usage example null-checks the result)
72
73 virtual void close() {
74 if (df_owner) {
75 delete dp_inStream;
76 df_owner = false;
77 }
78 dp_inStream = nullptr;
79 }
80
81 private:
82 // disable automatic copy constructors and assignment operators
83 // for this class and its subclasses. They will likely be
84 // carrying around stream pointers and copying those is a recipe
85 // for disaster.
86 MolSupplier(const MolSupplier &);
87 MolSupplier &operator=(const MolSupplier &);
88
89 protected:
90 // stream to read the molecules from:
91 std::istream *dp_inStream = nullptr;
92 // do we own dp_inStream?
93 bool df_owner = false;
94 // opens a stream for reading and verifies that it can be read from.
95 // if not it throws an exception
96 // the caller owns the resulting stream
97 std::istream *openAndCheckStream(const std::string &filename) {
98 // FIX: this binary mode of opening file is here because of a bug in
99 // VC++ 6.0
100 // the function "tellg" does not work correctly if we do not open it this
101 // way
102 // Jan 2009: Confirmed that this is still the case in visual studio 2008
103 std::ifstream *strm =
104 new std::ifstream(filename.c_str(), std::ios_base::binary);
105 if ((!(*strm)) || strm->bad()) {
106 std::ostringstream errout;
107 errout << "Bad input file " << filename;
108 delete strm;
109 throw BadFileException(errout.str());
110 }
111
112 strm->peek();
113 if (strm->bad() || strm->eof()) {
114 std::ostringstream errout;
115 errout << "Invalid input file " << filename;
116 delete strm;
117 throw BadFileException(errout.str());
118 }
119 return static_cast<std::istream *>(strm);
120 }
121};
122
123// \brief a supplier from an SD file that only reads forward:
125 /*************************************************************************
126 * A lazy mol supplier from a SD file.
127 * - When new molecules are read using "next" their positions in the file are
128 *noted.
129 ***********************************************************************************/
130 public:
132
134 std::istream *inStream, bool takeOwnership = true,
135 const MolFileParserParams &params = MolFileParserParams());
136
137 ~ForwardSDMolSupplier() override { close(); }
138
139 void init() override;
140 void reset() override;
141 std::unique_ptr<RWMol> next() override;
142 bool atEnd() override;
143
144 void setProcessPropertyLists(bool val) { df_processPropertyLists = val; }  // stores val in the df_processPropertyLists flag (in-class default: true)
145 bool getProcessPropertyLists() const { return df_processPropertyLists; }  // returns the current value of that flag
146
147 bool getEOFHitOnRead() const { return df_eofHitOnRead; }  // returns df_eofHitOnRead (in-class default: false); NOTE(review): presumably set when a read runs into end-of-file - confirm in the implementation file
148
149 protected:
150 virtual void checkForEnd();
151 std::unique_ptr<RWMol> _next();
152 virtual void readMolProps(ROMol &);
153 bool df_end = false;
154 int d_line = 0; // line number we are currently on
156 bool df_processPropertyLists = true;
157 bool df_eofHitOnRead = false;
158};
159// \brief a lazy supplier from an SD file
161 /*************************************************************************
162 * A lazy mol supplier from a SD file.
163 * - When new molecules are read using "next" their positions in the file are
164 *noted.
165 * - A call to the "length" will automatically parse the entire file and
166 *cache all the mol
167 * block positions
168 * - [] operator is used to access a molecule at "idx", calling next
169 *following this will result
170 * in the next molecule after "idx"
171 ***********************************************************************************/
172
173 public:
174 SDMolSupplier() { init(); }
175
176 /*!
177 * \param fileName - the name of the SD file
178 * \param sanitize - if true sanitize the molecule before returning it
179 * \param removeHs - if true remove Hs from the molecule before returning it
180 * (triggers sanitization)
181 * \param strictParsing - if set to false, the parser is more lax about
182 * correctness
183 * of the contents.
184 */
186 const std::string &fileName,
187 const MolFileParserParams &params = MolFileParserParams());
188
190 std::istream *inStream, bool takeOwnership = true,
191 const MolFileParserParams &params = MolFileParserParams());
192
193 ~SDMolSupplier() override { close(); }
194 void init() override;
195 void reset() override;
196 std::unique_ptr<RWMol> next() override;
197 bool atEnd() override;
198 void moveTo(unsigned int idx);
199 std::unique_ptr<RWMol> operator[](unsigned int idx);
200 /*! \brief returns the text block for a particular item
201 *
202 * \param idx - which item to return
203 */
204 std::string getItemText(unsigned int idx);
205 unsigned int length();
206 void setData(const std::string &text);
207 void setData(const std::string &text, const MolFileParserParams &params);
208
209 /*! Resets our internal state and sets the indices of molecules in the stream.
210 * The client should be *very* careful about calling this method, as it's
211 *trivial
212 * to end up with a completely useless supplier.
213 *
214 * \param locs - the vector of stream positions.
215 *
216 * Note that this can be used not only to make reading selected molecules
217 *from a
218 * large SD file much faster, but it can also allow subsetting an SD file or
219 * rearranging the order of the molecules.
220 */
221 void setStreamIndices(const std::vector<std::streampos> &locs);
222
223 private:
224 void checkForEnd() override;
225 int d_len = 0; // total number of mol blocks in the file (initialized to -1) -- NOTE(review): the in-class initializer here is 0; the -1 presumably refers to what init() assigns - confirm
226 int d_last = 0; // the molecule we are ready to read
227 std::vector<std::streampos> d_molpos;  // stream positions; NOTE(review): presumably one entry per mol block, as for the like-named member of the other suppliers - confirm
228};
229
231 std::string delimiter = " \t";
233 int nameColumn = 1;
234 bool titleLine = true;
236 true, // sanitize
237 false, // allowCXSMILES
238 true, // strictCXSMILES
239 false, // parseName
240 true, // removeHs
241 false, // skipCleanup
242 false, // debugParse
243 {} // replacements
244 };
245};
246
247//! lazy file parser for Smiles tables
249 /**************************************************************************
250 * Lazy file parser for Smiles table file, similar to the lazy SD
251 * file parser above
252 * - As and when new molecules are read using "next" their
253 * positions in the file are noted.
254 * - A call to the "length" will automatically parse the entire
255 * file and cache all the mol block positions
256 * - [] operator is used to access a molecule at "idx", calling
257 * next following this will result in the next molecule after
258 * "idx"
259 ***************************************************************************/
260 public:
261 /*!
262 * \param fileName - the name of smiles table file
263 * \param delimiter - delimiting characters between records on each
264 * line NOTE that this is not a string, the tokenizer looks for
265 * the individual characters in delimiter, not the full string
266 * itself. So the default delimiter: " \t", means " " or "\t".
267 * \param smilesColumn - column number for the SMILES string (defaults
268 * to the first column)
269 * \param nameColumn - column number for the molecule name (defaults to
270 * the second column) If set to -1 we assume that no name is
271 * available for the molecule and the name is defaulted to the
272 * smiles string
273 * \param titleLine - if true, the first line is assumed to list the
274 * names of properties in order separated by 'delimiter'. It is
275 * also assumed that the 'SMILES' column and the 'name' column
276 * are not specified here. If false - no title line is assumed
277 * and the properties are recorded as the "columnX" where "X" is
278 * the column number
279 * \param sanitize - if true sanitize the molecule before returning it
280 */
282 const std::string &fileName,
286 std::istream *inStream, bool takeOwnership = true,
288
289 ~SmilesMolSupplier() override { close(); }
290 void setData(const std::string &text, const SmilesMolSupplierParams &params =
292 void init() override;
293 void reset() override;
294 std::unique_ptr<RWMol> next() override;
295 bool atEnd() override;
296 void moveTo(unsigned int idx);
297 std::unique_ptr<RWMol> operator[](unsigned int idx);
298 /*! \brief returns the text block for a particular item
299 *
300 * \param idx - which item to return
301 */
302 std::string getItemText(unsigned int idx);
303 unsigned int length();
304
305 private:
306 std::unique_ptr<RWMol> processLine(std::string inLine);
307 void processTitleLine();
308 std::string nextLine();
309 long int skipComments();
310 void checkForEnd();
311
312 bool df_end = false; // have we reached the end of the file?
313 long d_len = 0; // total number of smiles in the file
314 long d_next = 0; // the molecule we are ready to read
315 size_t d_line = 0; // line number we are currently on
317 std::vector<std::streampos>
318 d_molpos; // vector of positions in the file for molecules
319 std::vector<int> d_lineNums;
320 STR_VECT d_props; // vector of property names
321};
322
324 std::string nameRecord = "";
325 int confId2D = -1;
326 int confId3D = -1;
328 true, // sanitize
329 false, // allowCXSMILES
330 true, // strictCXSMILES
331 false, // parseName
332 true, // removeHs
333 false, // skipCleanup
334 false, // debugParse
335 {} // replacements
336 };
337};
338
339//! lazy file parser for TDT files
341 /**************************************************************************
342 * Lazy file parser for TDT files, similar to the lazy SD
343 * file parser above
344 * - As and when new molecules are read using "next" their
345 * positions in the file are noted.
346 * - A call to the "length" will automatically parse the entire
347 * file and cache all the mol block positions
348 * - [] operator is used to access a molecule at "idx", calling
349 * next following this will result in the next molecule after
350 * "idx"
351 ***************************************************************************/
352 public:
353 /*!
354 * \param fileName - the name of the TDT file
355 * \param nameRecord - property name for the molecule name.
356 * If empty (the default), the name defaults to be empty
357 * \param confId2D - if >=0 and 2D coordinates are provided, the 2D
358 * structure (depiction) in the input will be read into the
359 * corresponding conformer id.
360 * \param confId3D - if >=0 and 3D coordinates are provided, the 3D
361 * structure (depiction) in the input will be read into the
362 * corresponding conformer id.
363 * \param sanitize - if true sanitize the molecule before returning it
364 */
366 const std::string &fileName,
369 std::istream *inStream, bool takeOwnership = true,
372 ~TDTMolSupplier() override { close(); }
373 void setData(const std::string &text,
375 void init() override;
376 void reset() override;
377 std::unique_ptr<RWMol> next() override;
378 bool atEnd() override;
379 void moveTo(unsigned int idx);
380 std::unique_ptr<RWMol> operator[](unsigned int idx);
381 /*! \brief returns the text block for a particular item
382 *
383 * \param idx - which item to return
384 */
385 std::string getItemText(unsigned int idx);
386 unsigned int length();
387
388 private:
389 bool advanceToNextRecord();
390 void checkForEnd();
391 std::unique_ptr<RWMol> parseMol(std::string inLine);
392
393 bool df_end = false; // have we reached the end of the file?
394 int d_len = 0; // total number of mols in the file
395 int d_last = 0; // the molecule we are ready to read
396 int d_line = 0; // line number we are currently on
397 std::vector<std::streampos>
398 d_molpos; // vector of positions in the file for molecules
399 TDTMolSupplierParams d_params;
400};
401
402//! Deprecated, will be removed in 2024.09 release
404 public:
405 explicit PDBMolSupplier(std::istream *inStream, bool takeOwnership = true,
406 const PDBParserParams &params = PDBParserParams());
407 explicit PDBMolSupplier(const std::string &fname,
408 const PDBParserParams &params = PDBParserParams());
409
410 ~PDBMolSupplier() override { close(); }
411
412 void init() override;
413 void reset() override;
414 std::unique_ptr<RWMol> next() override;
415 bool atEnd() override;
416
417 protected:
419};
420#ifdef RDK_BUILD_MAEPARSER_SUPPORT
422 bool sanitize = true;
423 bool removeHs = true;
424};
425//! lazy file parser for MAE files
427 /**
428 * Due to maeparser's shared_ptr<istream> Reader interface, MaeMolSupplier
429 * always requires taking ownership of the istream ptr, as the shared ptr will
430 * always clear it upon destruction.
431 */
432
433 public:
434 MaeMolSupplier() {}
435
436 explicit MaeMolSupplier(
437 std::shared_ptr<std::istream> inStream,
438 const MaeMolSupplierParams &params = MaeMolSupplierParams());
439
440 explicit MaeMolSupplier(
441 std::istream *inStream, bool takeOwnership = true,
442 const MaeMolSupplierParams &params = MaeMolSupplierParams());
443
444 explicit MaeMolSupplier(
445 const std::string &fname,
446 const MaeMolSupplierParams &params = MaeMolSupplierParams());
447
448 ~MaeMolSupplier() override {}
449
450 void init() override;
451 void reset() override;
452 std::unique_ptr<RWMol> next() override;
453 bool atEnd() override;
454 void moveTo(unsigned int idx);
455 std::unique_ptr<RWMol> operator[](unsigned int idx);
456 unsigned int length();
457
458 void close() override { dp_sInStream.reset(); }  // drops this supplier's shared_ptr reference to the input stream; the base-class dp_inStream / df_owner members are not touched here
459
460 void setData(const std::string &text,
461 const MaeMolSupplierParams &params = MaeMolSupplierParams());
462
463 private:
464 void moveToNextBlock();
465
466 protected:
467 MaeMolSupplierParams d_params;
468 std::shared_ptr<schrodinger::mae::Reader> d_reader;
469 std::shared_ptr<schrodinger::mae::Block> d_next_struct;
470 std::shared_ptr<std::istream> dp_sInStream;
471 std::string d_stored_exc;
472 unsigned d_position;
473 unsigned d_length;
474};
475#endif // RDK_BUILD_MAEPARSER_SUPPORT
476
477} // namespace FileParsers
478} // namespace v2
479} // namespace RDKit
480
481#include "MolSupplier.v1API.h"
482
483#endif
Defines the primary molecule class ROMol as well as associated typedefs.
used by various file parsing classes to indicate a bad file
ForwardSDMolSupplier(std::istream *inStream, bool takeOwnership=true, const MolFileParserParams &params=MolFileParserParams())
std::unique_ptr< RWMol > next() override
std::istream * openAndCheckStream(const std::string &filename)
Definition MolSupplier.h:97
virtual std::unique_ptr< RWMol > next()=0
Deprecated, will be removed in 2024.09 release.
PDBMolSupplier(std::istream *inStream, bool takeOwnership=true, const PDBParserParams &params=PDBParserParams())
std::unique_ptr< RWMol > next() override
PDBMolSupplier(const std::string &fname, const PDBParserParams &params=PDBParserParams())
void setStreamIndices(const std::vector< std::streampos > &locs)
void setData(const std::string &text)
std::unique_ptr< RWMol > next() override
SDMolSupplier(std::istream *inStream, bool takeOwnership=true, const MolFileParserParams &params=MolFileParserParams())
std::string getItemText(unsigned int idx)
returns the text block for a particular item
void setData(const std::string &text, const MolFileParserParams &params)
SDMolSupplier(const std::string &fileName, const MolFileParserParams &params=MolFileParserParams())
std::unique_ptr< RWMol > operator[](unsigned int idx)
lazy file parser for Smiles tables
std::string getItemText(unsigned int idx)
returns the text block for a particular item
std::unique_ptr< RWMol > next() override
SmilesMolSupplier(std::istream *inStream, bool takeOwnership=true, const SmilesMolSupplierParams &params=SmilesMolSupplierParams())
std::unique_ptr< RWMol > operator[](unsigned int idx)
SmilesMolSupplier(const std::string &fileName, const SmilesMolSupplierParams &params=SmilesMolSupplierParams())
void setData(const std::string &text, const SmilesMolSupplierParams &params=SmilesMolSupplierParams())
lazy file parser for TDT files
std::unique_ptr< RWMol > operator[](unsigned int idx)
void setData(const std::string &text, const TDTMolSupplierParams &params=TDTMolSupplierParams())
TDTMolSupplier(const std::string &fileName, const TDTMolSupplierParams &params=TDTMolSupplierParams())
std::string getItemText(unsigned int idx)
returns the text block for a particular item
TDTMolSupplier(std::istream *inStream, bool takeOwnership=true, const TDTMolSupplierParams &params=TDTMolSupplierParams())
std::unique_ptr< RWMol > next() override
#define RDKIT_FILEPARSERS_EXPORT
Definition export.h:161
Std stuff.
RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig)
std::vector< std::string > STR_VECT
Definition Dict.h:29
bool rdvalue_is(const RDValue_cast_t)
v2::SmilesParse::SmilesParserParams parseParameters
v2::SmilesParse::SmilesParserParams parseParameters