MolSupplier.h

Go to the documentation of this file.
00001 //
00002 //  Copyright (C) 2002-2008 greg landrum and Rational Discovery LLC
00003 //
00004 //   @@ All Rights Reserved  @@
00005 //
00006 #ifndef _RD_MOLSUPPLIER_H
00007 #define _RD_MOLSUPPLIER_H
00008 
00009 #include <RDGeneral/types.h>
00010 
00011 #include <string>
00012 #include <iostream>
00013 #include <GraphMol/ROMol.h>
00014 
00015 namespace RDKit {
00016 
00017   /*! 
00018   //
00019   //  Here are a couple of ways one can interact with MolSuppliers:
00020   //
00021   //  1) Lazy (ForwardIterator):
00022   //     while(!supplier.atEnd()){
00023   //       ROMol *mol = supplier.next();
00024   //       if(mol){
00025   //           do something;
00026   //       }
00027   //     }
00028   //  2) Random Access:
00029   //     for(int i=0;i<supplier.length();i++){
00030   //       ROMol *mol = supplier[i];
00031   //       if(mol){
00032   //           do something;
00033   //       }
00034   //     }
00035   //
00036   //
00037   */
00038   class MolSupplier {
00039     // Abstract base class for objects that supply molecules one at a time.
00040   public:
00041     MolSupplier() : dp_inStream(0), df_owner(false) {}  // no stream, nothing owned: the members never start out indeterminate
00042     virtual ~MolSupplier() {}
00043     virtual void init() = 0;  // subclass-specific state setup (SDMolSupplier calls it from its default constructor)
00044     virtual void reset() = 0;  // presumably rewinds the supplier to its first molecule - confirm in the subclasses
00045     virtual bool atEnd() = 0;  // loop guard for forward iteration: while(!supplier.atEnd()){...}
00046     virtual ROMol *next() = 0;  // the usage example for this class checks the result for null before using it
00047 
00048   private:
00049     // disable automatic copy constructors and assignment operators
00050     // for this class and its subclasses.  They will likely be
00051     // carrying around stream pointers and copying those is a recipe
00052     // for disaster.  (Declared private and intentionally not defined inline.)
00053     MolSupplier(const MolSupplier&);
00054     MolSupplier &operator=(const MolSupplier&);
00055   protected:
00056     // stream to read the molecules from (0 until a subclass assigns one):
00057     std::istream *dp_inStream;
00058     // do we own dp_inStream? (false until a subclass says otherwise)
00059     bool df_owner; 
00060   };
00061 
00062 
00063   //! \brief a lazy supplier from an SD file
00064   class SDMolSupplier : public MolSupplier {
00065     /*************************************************************************
00066      * A lazy mol supplier that reads from an SD file.
00067      *  - As molecules are read using "next", their positions in the file are noted.
00068      *  - A call to "length" will automatically parse the entire file and cache all the mol
00069      *    block positions.
00070      *  - The [] operator is used to access the molecule at "idx"; calling "next" after that
00071      *    returns the molecule following "idx".
00072      ***********************************************************************************/
00073 
00074   public:
00075     SDMolSupplier() { init(); };  // no input attached; just runs init()
00076 
00077     /*! 
00078      *   \param fileName - the name of the SD file
00079      *   \param sanitize - if true sanitize the molecule before returning it
00080      *   \param removeHs - if true remove Hs from the molecule before returning it
00081      *                     (triggers sanitization)
00082      */
00083     explicit SDMolSupplier(const std::string &fileName, bool sanitize=true,
00084                            bool removeHs=true);
00085     // stream-based variant; NOTE(review): takeOwnership presumably hands cleanup of inStream to the supplier (cf. df_owner) - confirm
00086     explicit SDMolSupplier(std::istream *inStream, bool takeOwnership=true,
00087                            bool sanitize=true,bool removeHs=true);
00088 
00089     
00090     ~SDMolSupplier();
00091     void init();
00092     void reset();
00093     ROMol *next();  // reads the next molecule; its file position is noted as a side effect (see class comment)
00094     bool atEnd(); 
00095     void moveTo(unsigned int idx);  // presumably positions the supplier at item idx - confirm in the implementation
00096     ROMol * operator[](unsigned int idx);  // random access; a following next() yields the molecule after idx
00097     /*! \brief returns the text block for a particular item
00098      *  
00099      *  \param idx - which item to return
00100      */
00101     std::string getItemText(unsigned int idx);
00102     unsigned int length();  // parses the whole file on demand and caches every mol block position
00103     void setData(const std::string &text,bool sanitize=true, bool removeHs=true);  // presumably switches the input to the given SD text - confirm
00104 
00105     /*! Resets our internal state and sets the indices of molecules in the stream.
00106      *  The client should be *very* careful about calling this method, as it's trivial
00107      *  to end up with a completely useless supplier.
00108      *
00109      *   \param locs - the vector of stream positions.
00110      *
00111      *  Note that this can be used not only to make reading selected molecules from a
00112      *  large SD file much faster, but it can also allow subsetting an SD file or
00113      *  rearranging the order of the molecules.
00114      */
00115     void setStreamIndices(const std::vector<std::streampos> &locs);
00116 
00117   private :
00118     void readMolProps(ROMol *);
00119     void checkForEnd();
00120     bool df_end; // presumably: have we reached the end of the input? (confirm in the implementation)
00121     int d_len; // total number of mol blocks in the file (initialized to -1)
00122     int d_last; // the molecule we are ready to read
00123     int d_line; // line number we are currently on
00124     std::vector<std::streampos> d_molpos; // cached stream positions of the mol blocks
00125     bool df_sanitize,df_removeHs; // presumably mirror the sanitize/removeHs arguments - confirm
00126   };
00127 
00128   //! lazy file parser for Smiles tables
00129   class SmilesMolSupplier : public MolSupplier {
00130     /**************************************************************************
00131      * Lazy file parser for Smiles table files, similar to the lazy SD
00132      * file parser above
00133      *  - As and when new molecules are read using "next", their
00134      *    positions in the file are noted.
00135      *  - A call to "length" will automatically parse the entire
00136      *    file and cache all the mol block positions
00137      *  - The [] operator is used to access the molecule at "idx"; calling
00138      *    "next" after that returns the molecule following
00139      *    "idx"
00140      ***************************************************************************/ 
00141   public:
00142 
00143     /*! 
00144      *   \param fileName - the name of smiles table file
00145      *   \param delimiter - delimiting characters between records on each
00146      *     line. NOTE that this is not a string: the tokenizer looks for
00147      *     the individual characters in delimiter, not the full string
00148      *     itself.  So the default delimiter: " \t", means " " or "\t".
00149      *   \param smilesColumn - column number for the SMILES string (defaults
00150      *     to the first column)
00151      *   \param nameColumn - column number for the molecule name (defaults to
00152      *     the second column). If set to -1 we assume that no name is
00153      *     available for the molecule and the name is defaulted to the
00154      *     smiles string
00155      *   \param titleLine - if true, the first line is assumed to list the
00156      *     names of properties in order, separated by 'delimiter'. It is
00157      *     also assumed that the 'SMILES' column and the 'name' column
00158      *     are not specified here. If false, no title line is assumed
00159      *     and the properties are recorded as "columnX", where "X" is
00160      *     the column number
00161      *   \param sanitize - if true sanitize the molecule before returning it
00162      */
00163     explicit SmilesMolSupplier(const std::string &fileName, 
00164                                const std::string &delimiter=" \t",
00165                                int smilesColumn=0,
00166                                int nameColumn=1, 
00167                                bool titleLine=true,                
00168                                bool sanitize=true);
00169     SmilesMolSupplier();
00170     explicit SmilesMolSupplier(std::istream *inStream, bool takeOwnership=true,
00171                                const std::string &delimiter=" \t",
00172                                int smilesColumn=0,
00173                                int nameColumn=1, 
00174                                bool titleLine=true,                
00175                                bool sanitize=true);                               
00176     // NOTE(review): setData defaults delimiter to " " while both constructors default to " \t" - confirm this is intended
00177     ~SmilesMolSupplier();
00178     void setData(const std::string &text,
00179                  const std::string &delimiter=" ",
00180                  int smilesColumn=0,
00181                  int nameColumn=1, 
00182                  bool titleLine=true,              
00183                  bool sanitize=true);
00184     void init();
00185     void reset();
00186     ROMol *next();  // reads the next molecule; its file position is noted as a side effect (see class comment)
00187     bool atEnd();
00188     void moveTo(unsigned int idx);  // presumably positions the supplier at item idx - confirm in the implementation
00189     ROMol * operator[](unsigned int idx);  // random access; a following next() yields the molecule after idx
00190     /*! \brief returns the text block for a particular item
00191      *  
00192      *  \param idx - which item to return
00193      */
00194     std::string getItemText(unsigned int idx);
00195     unsigned int length();  // parses the whole file on demand and caches every molecule position
00196 
00197   private:
00198     ROMol *processLine(std::string inLine);
00199     void processTitleLine();
00200     std::string nextLine();
00201     int skipComments();
00202     void checkForEnd();
00203     
00204     bool df_end; // have we reached the end of the file?
00205     int d_len; // total number of smiles in the file
00206     int d_next; // the molecule we are ready to read
00207     int d_line; // line number we are currently on
00208     std::vector<std::streampos> d_molpos; // vector of positions in the file for molecules
00209     std::vector<int> d_lineNums; // presumably the line number of each cached molecule position - confirm
00210     std::string d_delim; // the delimiter characters (each character is a separator, see constructor docs)
00211     bool df_sanitize; // sanitize molecules before returning them?
00212     STR_VECT d_props; // vector of property names
00213     bool df_title; // do we have a title line?
00214     int d_smi; // column id for the SMILES string
00215     int d_name; // column id for the name
00216   };
00217 
00218   //! lazy file parser for TDT files
00219   class TDTMolSupplier : public MolSupplier {
00220     /**************************************************************************
00221      * Lazy file parser for TDT files, similar to the lazy SD
00222      * file parser above
00223      *  - As and when new molecules are read using "next", their
00224      *    positions in the file are noted.
00225      *  - A call to "length" will automatically parse the entire
00226      *    file and cache all the mol block positions
00227      *  - The [] operator is used to access the molecule at "idx"; calling
00228      *    "next" after that returns the molecule following
00229      *    "idx"
00230      ***************************************************************************/ 
00231   public:
00232 
00233     /*! 
00234      *   \param fileName - the name of the TDT file
00235      *   \param nameRecord - property name for the molecule name.
00236      *     If empty (the default), the name defaults to be empty
00237      *   \param confId2D - if >=0 and 2D coordinates are provided, the 2D
00238      *                   structure (depiction) in the input will be read into the
00239      *                   corresponding conformer id (the default is -1).
00240      *   \param confId3D - if >=0 and 3D coordinates are provided, the 3D
00241      *                   structure in the input will be read into the
00242      *                   corresponding conformer id (the default is 0).
00243      *   \param sanitize - if true sanitize the molecule before returning it
00244      */
00245     explicit TDTMolSupplier(const std::string &fileName, 
00246                             const std::string &nameRecord="",
00247                             int confId2D=-1,int confId3D=0,
00248                             bool sanitize=true);
00249     explicit TDTMolSupplier(std::istream *inStream, bool takeOwnership=true,
00250                             const std::string &nameRecord="",
00251                             int confId2D=-1,int confId3D=0,
00252                             bool sanitize=true);
00253     TDTMolSupplier();
00254     ~TDTMolSupplier();
00255     void setData(const std::string &text,
00256                  const std::string &nameRecord="",
00257                  int confId2D=-1,int confId3D=0,
00258                  bool sanitize=true);
00259     void init();
00260     void reset();
00261     ROMol *next();  // reads the next molecule; its file position is noted as a side effect (see class comment)
00262     bool atEnd();
00263     void moveTo(unsigned int idx);  // presumably positions the supplier at item idx - confirm in the implementation
00264     ROMol * operator[](unsigned int idx);  // random access; a following next() yields the molecule after idx
00265     /*! \brief returns the text block for a particular item
00266      *  
00267      *  \param idx - which item to return
00268      */
00269     std::string getItemText(unsigned int idx);
00270     unsigned int length();  // parses the whole file on demand and caches every molecule position
00271 
00272   private:
00273     bool advanceToNextRecord();
00274     void checkForEnd();
00275     ROMol *parseMol(std::string inLine);
00276 
00277     bool df_end; // have we reached the end of the file?
00278     int d_len; // total number of mols in the file
00279     int d_next; // the molecule we are ready to read
00280     int d_last; // NOTE(review): original comment duplicated d_next's; presumably the last molecule read - confirm
00281     int d_line; // line number we are currently on
00282     int d_confId2D; // id to use for 2D conformers
00283     int d_confId3D; // id to use for 3D conformers
00284     std::vector<std::streampos> d_molpos; // vector of positions in the file for molecules
00285     bool df_sanitize; // sanitize molecules before returning them?
00286     std::string d_nameProp; // local storage for the property providing mol names
00287   };
00288 
00289 }
00290 
00291 #endif

Generated on Fri Apr 3 06:03:02 2009 for RDCode by  doxygen 1.5.6