RDKit
Open-source cheminformatics and machine learning.
MolSupplier.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2002-2013 greg landrum, Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #ifndef _RD_MOLSUPPLIER_H
11 #define _RD_MOLSUPPLIER_H
12 
13 #include <RDGeneral/types.h>
14 
15 #include <string>
16 #include <list>
17 #include <vector>
18 #include <iostream>
19 #include <GraphMol/ROMol.h>
20 
21 namespace RDKit {
22 std::string strip(const std::string &orig);
23 
24 /*!
25 //
26 // Here are a couple of ways one can interact with MolSuppliers:
27 //
28 // 1) Lazy (ForwardIterator):
29 // while(!supplier.atEnd()){
30 // ROMol *mol = supplier.next();
31 // if(mol){
32 // do something;
33 // }
34 // }
35 // 2) Random Access:
36 // for(int i=0;i<supplier.length();i++){
37 // ROMol *mol = supplier[i];
38 // if(mol){
39 // do something;
40 // }
41 // }
42 //
43 //
44 */
45 class MolSupplier {
46  // this is an abstract base class to supply molecules one at a time
47  public:
49  virtual ~MolSupplier(){};
50  virtual void init() = 0;
51  virtual void reset() = 0;
52  virtual bool atEnd() = 0;
53  virtual ROMol *next() = 0;
54 
55  private:
56  // disable automatic copy constructors and assignment operators
57  // for this class and its subclasses. They will likely be
58  // carrying around stream pointers and copying those is a recipe
59  // for disaster.
60  MolSupplier(const MolSupplier &);
61  MolSupplier &operator=(const MolSupplier &);
62 
63  protected:
64  // stream to read the molecules from:
65  std::istream *dp_inStream;
66  // do we own dp_inStream?
67  bool df_owner;
68 };
69 
70 // \brief a supplier from an SD file that only reads forward:
72  /*************************************************************************
73  * A lazy mol supplier from a SD file.
74  * - When new molecules are read using "next" their positions in the file are
75  *noted.
76  ***********************************************************************************/
77  public:
79 
80  explicit ForwardSDMolSupplier(std::istream *inStream,
81  bool takeOwnership = true, bool sanitize = true,
82  bool removeHs = true,
83  bool strictParsing = false);
84 
86  if (df_owner && dp_inStream) {
87  delete dp_inStream;
88  df_owner = false;
89  dp_inStream = NULL;
90  }
91  };
92 
93  virtual void init();
94  virtual void reset();
95  virtual ROMol *next();
96  virtual bool atEnd();
97 
98  protected:
99  virtual void checkForEnd();
100  ROMol *_next();
101  virtual void readMolProps(ROMol *);
102  bool df_end;
103  int d_line; // line number we are currently on
104  bool df_sanitize, df_removeHs, df_strictParsing;
105 };
106 
107 // \brief a lazy supplier from an SD file
109  /*************************************************************************
110  * A lazy mol supplier from a SD file.
111  * - When new molecules are read using "next" their positions in the file are
112  *noted.
113  * - A call to the "length" will automatically parse the entire file and
114  *cache all the mol
115  * block positions
116  * - [] operator is used to access a molecule at "idx", calling next
117  *following this will result
118  * in the next molecule after "idx"
119  ***********************************************************************************/
120 
121  public:
122  SDMolSupplier() { init(); };
123 
124  /*!
125  * \param fileName - the name of the SD file
126  * \param sanitize - if true sanitize the molecule before returning it
127  * \param removeHs - if true remove Hs from the molecule before returning it
128  * (triggers sanitization)
129  * \param strictParsing - if not set, the parser is more lax about
130  * correctness
131  * of the contents.
132  */
133  explicit SDMolSupplier(const std::string &fileName, bool sanitize = true,
134  bool removeHs = true, bool strictParsing = true);
135 
136  explicit SDMolSupplier(std::istream *inStream, bool takeOwnership = true,
137  bool sanitize = true, bool removeHs = true,
138  bool strictParsing = true);
139 
141  void init();
142  void reset();
143  ROMol *next();
144  bool atEnd();
145  void moveTo(unsigned int idx);
146  ROMol *operator[](unsigned int idx);
147  /*! \brief returns the text block for a particular item
148  *
149  * \param idx - which item to return
150  */
151  std::string getItemText(unsigned int idx);
152  unsigned int length();
153  void setData(const std::string &text, bool sanitize = true,
154  bool removeHs = true);
155  void setData(const std::string &text, bool sanitize, bool removeHs,
156  bool strictParsing);
157 
158  /*! Resets our internal state and sets the indices of molecules in the stream.
159  * The client should be *very* careful about calling this method, as it's
160  *trivial
161  * to end up with a completely useless supplier.
162  *
163  * \param locs - the vector of stream positions.
164  *
165  * Note that this can be used not only to make reading selected molecules
166  *from a
167  * large SD file much faster, but it can also allow subsetting an SD file or
168  * rearranging the order of the molecules.
169  */
170  void setStreamIndices(const std::vector<std::streampos> &locs);
171 
172  private:
173  void checkForEnd();
174  void setDataCommon(const std::string &text, bool sanitize, bool removeHs);
175  int d_len; // total number of mol blocks in the file (initialized to -1)
176  int d_last; // the molecule we are ready to read
177  std::vector<std::streampos> d_molpos;
178 };
179 
180 //! lazy file parser for Smiles tables
182  /**************************************************************************
183  * Lazy file parser for Smiles table file, similar to the lazy SD
184  * file parser above
185  * - As an when new molecules are read using "next" their
186  * positions in the file are noted.
187  * - A call to the "length" will autamatically parse the entire
188  * file and cache all the mol block positions
189  * - [] operator is used to access a molecule at "idx", calling
190  * next following this will result in the next molecule after
191  * "idx"
192  ***************************************************************************/
193  public:
194  /*!
195  * \param fileName - the name of smiles table file
196  * \param delimiter - delimiting characters between records on a each
197  * line NOTE that this is not a string, the tokenizer looks for
198  * the individual characters in delimiter, not the full string
199  * itself. So the default delimiter: " \t", means " " or "\t".
200  * \param smilesColumn - column number for the SMILES string (defaults
201  * to the first column)
202  * \param nameColumn - column number for the molecule name (defaults to
203  * the second column) If set to -1 we assume that no name is
204  * available for the molecule and the name is defaulted to the
205  * smiles string
206  * \param titleLine - if true, the first line is assumed to list the
207  * names of properties in order seperated by 'delimiter'. It is
208  * also assume that the 'SMILES' column and the 'name' column
209  * are not specified here if false - no title line is assumed
210  * and the properties are recorded as the "columnX" where "X" is
211  * the column number
212  * \param sanitize - if true sanitize the molecule before returning it
213  */
214  explicit SmilesMolSupplier(const std::string &fileName,
215  const std::string &delimiter = " \t",
216  int smilesColumn = 0, int nameColumn = 1,
217  bool titleLine = true, bool sanitize = true);
219  explicit SmilesMolSupplier(std::istream *inStream, bool takeOwnership = true,
220  const std::string &delimiter = " \t",
221  int smilesColumn = 0, int nameColumn = 1,
222  bool titleLine = true, bool sanitize = true);
223 
225  void setData(const std::string &text, const std::string &delimiter = " ",
226  int smilesColumn = 0, int nameColumn = 1, bool titleLine = true,
227  bool sanitize = true);
228  void init();
229  void reset();
230  ROMol *next();
231  bool atEnd();
232  void moveTo(unsigned int idx);
233  ROMol *operator[](unsigned int idx);
234  /*! \brief returns the text block for a particular item
235  *
236  * \param idx - which item to return
237  */
238  std::string getItemText(unsigned int idx);
239  unsigned int length();
240 
241  private:
242  ROMol *processLine(std::string inLine);
243  void processTitleLine();
244  std::string nextLine();
245  long int skipComments();
246  void checkForEnd();
247 
248  bool df_end; // have we reached the end of the file?
249  int d_len; // total number of smiles in the file
250  int d_next; // the molecule we are ready to read
251  int d_line; // line number we are currently on
252  std::vector<std::streampos>
253  d_molpos; // vector of positions in the file for molecules
254  std::vector<int> d_lineNums;
255  std::string d_delim; // the delimiter string
256  bool df_sanitize; // sanitize molecules before returning them?
257  STR_VECT d_props; // vector of property names
258  bool df_title; // do we have a title line?
259  int d_smi; // column id for the smile string
260  int d_name; // column id for the name
261 };
262 
263 //! lazy file parser for TDT files
264 class TDTMolSupplier : public MolSupplier {
265  /**************************************************************************
266  * Lazy file parser for TDT files, similar to the lazy SD
267  * file parser above
268  * - As an when new molecules are read using "next" their
269  * positions in the file are noted.
270  * - A call to the "length" will autamatically parse the entire
271  * file and cache all the mol block positions
272  * - [] operator is used to access a molecule at "idx", calling
273  * next following this will result in the next molecule after
274  * "idx"
275  ***************************************************************************/
276  public:
277  /*!
278  * \param fileName - the name of the TDT file
279  * \param nameRecord - property name for the molecule name.
280  * If empty (the default), the name defaults to be empty
281  * \param confId2D - if >=0 and 2D coordinates are provided, the 2D
282  * structure (depiction) in the input will be read into the
283  * corresponding conformer id.
284  * \param confId3D - if >=0 and 3D coordinates are provided, the 3D
285  * structure (depiction) in the input will be read into the
286  * corresponding conformer id.
287  * \param sanitize - if true sanitize the molecule before returning it
288  */
289  explicit TDTMolSupplier(const std::string &fileName,
290  const std::string &nameRecord = "", int confId2D = -1,
291  int confId3D = 0, bool sanitize = true);
292  explicit TDTMolSupplier(std::istream *inStream, bool takeOwnership = true,
293  const std::string &nameRecord = "", int confId2D = -1,
294  int confId3D = 0, bool sanitize = true);
295  TDTMolSupplier();
296  ~TDTMolSupplier();
297  void setData(const std::string &text, const std::string &nameRecord = "",
298  int confId2D = -1, int confId3D = 0, bool sanitize = true);
299  void init();
300  void reset();
301  ROMol *next();
302  bool atEnd();
303  void moveTo(unsigned int idx);
304  ROMol *operator[](unsigned int idx);
305  /*! \brief returns the text block for a particular item
306  *
307  * \param idx - which item to return
308  */
309  std::string getItemText(unsigned int idx);
310  unsigned int length();
311 
312  private:
313  bool advanceToNextRecord();
314  void checkForEnd();
315  ROMol *parseMol(std::string inLine);
316 
317  bool df_end; // have we reached the end of the file?
318  int d_len; // total number of mols in the file
319  int d_last; // the molecule we are ready to read
320  int d_line; // line number we are currently on
321  int d_confId2D; // id to use for 2D conformers
322  int d_confId3D; // id to use for 3D conformers
323  std::vector<std::streampos>
324  d_molpos; // vector of positions in the file for molecules
325  bool df_sanitize; // sanitize molecules before returning them?
326  std::string d_nameProp; // local storage for the property providing mol names
327 };
328 
329 //! lazy file parser for PDB files
330 class PDBMolSupplier : public MolSupplier {
331  public:
332  explicit PDBMolSupplier(std::istream *inStream, bool takeOwnership = true,
333  bool sanitize = true, bool removeHs = true,
334  unsigned int flavor = 0);
335  explicit PDBMolSupplier(const std::string &fname, bool sanitize = true,
336  bool removeHs = true, unsigned int flavor = 0);
337 
338  virtual ~PDBMolSupplier() {
339  if (df_owner && dp_inStream) delete dp_inStream;
340  };
341 
342  virtual void init();
343  virtual void reset();
344  virtual ROMol *next();
345  virtual bool atEnd();
346 
347  protected:
348  bool df_sanitize, df_removeHs;
349  unsigned int d_flavor;
350 };
351 }
352 
353 #endif
ROMol * removeHs(const ROMol &mol, bool implicitOnly=false, bool updateExplicitCount=false, bool sanitize=true)
returns a copy of a molecule with hydrogens removed
virtual ~MolSupplier()
Definition: MolSupplier.h:49
unsigned int d_flavor
Definition: MolSupplier.h:349
virtual ~PDBMolSupplier()
Definition: MolSupplier.h:338
virtual ROMol * next()=0
lazy file parser for TDT files
Definition: MolSupplier.h:264
virtual void reset()=0
Defines the primary molecule class ROMol as well as associated typedefs.
ROMol is a molecule class that is intended to have a fixed topology.
Definition: ROMol.h:103
std::string strip(const std::string &orig)
std::istream * dp_inStream
Definition: MolSupplier.h:65
Includes a bunch of functionality for handling Atom and Bond queries.
Definition: Atom.h:29
lazy file parser for Smiles tables
Definition: MolSupplier.h:181
lazy file parser for PDB files
Definition: MolSupplier.h:330
virtual bool atEnd()=0
std::vector< std::string > STR_VECT
Definition: Dict.h:26
virtual void init()=0