MolSupplier.h

Go to the documentation of this file.
00001 //
00002 //  Copyright (C) 2002-2006 greg landrum and Rational Discovery LLC
00003 //
00004 //   @@ All Rights Reserved  @@
00005 //
00006 #ifndef _RD_MOLSUPPLIER_H
00007 #define _RD_MOLSUPPLIER_H
00008 
00009 #include <RDGeneral/types.h>
00010 
00011 #include <string>
00012 #include <iostream>
00013 #include <GraphMol/ROMol.h>
00014 
00015 namespace RDKit {
00016 
00017   /*! 
00018   //  Abstract interface for classes that hand out molecules one at a time.
00019   //  Here are a couple of ways one can interact with MolSuppliers:
00020   //
00021   //  1) Lazy (ForwardIterator), using only the members declared below:
00022   //     while(!supplier.atEnd()){
00023   //       ROMol *mol = supplier.next();
00024   //       if(mol){
00025   //           do something;
00026   //       }
00027   //     }
00028   //  2) Random Access (length() returns unsigned int, so index with one):
00029   //     for(unsigned int i=0;i<supplier.length();i++){
00030   //       ROMol *mol = supplier[i];
00031   //       if(mol){
00032   //           do something;
00033   //       }
00034   //     }
00035   //  Note: length() and operator[] are not declared by MolSupplier itself;
00036   //  they are declared by the concrete suppliers later in this file.
00037   */
00038   class MolSupplier {
00039     // Abstract base class: supplies molecules one at a time.
00040   public:
00041     MolSupplier() {};
00042     virtual ~MolSupplier() {}; // virtual, so subclasses are destroyed correctly through a base pointer
00043     virtual void init() = 0;   // NOTE(review): presumably (re)initializes internal state - defined by each subclass
00044     virtual void reset() = 0;  // NOTE(review): presumably returns the supplier to its starting state - confirm in subclasses
00045     virtual bool atEnd() = 0;  // loop condition in the usage example: keep calling next() while this is false
00046     virtual ROMol *next() = 0; // the next molecule; the usage example checks the result for null
00047     // NOTE(review): ownership of the pointer returned by next() is not stated in this header - confirm.
00048   private:
00049     // Copy construction and assignment are declared private (no definition
00050     // appears in this header) to disable copying of this class and its
00051     // subclasses.  They will likely be carrying around stream pointers,
00052     // and copying those is a recipe for disaster.
00053     MolSupplier(const MolSupplier&);
00054     MolSupplier &operator=(const MolSupplier&);
00055 
00056   };
00057 
00058 
00059   //! \brief a lazy supplier of molecules from an SD file
00060   class SDMolSupplier : public MolSupplier {
00061     /*************************************************************************
00062      * A lazy mol supplier from an SD file. 
00063      *  - As molecules are read using "next", their positions in the file are noted. 
00064      *  - A call to "length" will automatically parse the entire file and cache all the mol
00065      *    block positions.
00066      *  - The [] operator is used to access the molecule at "idx"; calling "next" after this
00067      *    will result in the next molecule after "idx".
00068      ***********************************************************************************/
00069 
00070   public:
00071     SDMolSupplier() { init(); }; // only calls init(); no file name is supplied
00072 
00073     /*! 
00074      *   \param fileName - the name of the SD file
00075      *   \param sanitize - if true, sanitize the molecule before returning it
00076      *   \param removeHs - if true, remove Hs from the molecule before returning it
00077      *                     (triggers sanitization)
00078      */
00079     explicit SDMolSupplier(const std::string &fileName, bool sanitize=true,
00080                            bool removeHs=true);
00081     
00082     ~SDMolSupplier();
00083     void init();   // implements MolSupplier::init()
00084     void reset();  // implements MolSupplier::reset()
00085     ROMol *next(); // implements MolSupplier::next()
00086     bool atEnd();  // implements MolSupplier::atEnd()
00087     void moveTo(unsigned int idx); // NOTE(review): presumably positions the supplier at item idx - confirm
00088     ROMol * operator[](unsigned int idx); // random access to the molecule at idx (see the class comment)
00089     /*! \brief returns the text block for a particular item
00090      *  
00091      *  \param idx - which item to return
00092      */
00093     std::string getItemText(unsigned int idx);
00094     unsigned int length(); // per the class comment, parses the entire file and caches the mol block positions
00095     void setData(const std::string &text,bool sanitize=true, bool removeHs=true); // NOTE(review): undocumented; presumably supplies the SD data as in-memory text - confirm
00096 
00097     /*! Resets our internal state and sets the indices of molecules in the stream.
00098      *  The client should be *very* careful about calling this method, as it's trivial
00099      *  to end up with a completely useless supplier.
00100      *
00101      *   \param locs - the vector of stream positions, one per molecule.
00102      *
00103      *  Note that this can be used not only to make reading selected molecules from a
00104      *  large SD file much faster, but it can also allow subsetting an SD file or
00105      *  rearranging the order of the molecules.
00106      */
00107     void setStreamIndices(const std::vector<std::streampos> &locs);
00108 
00109   private :
00110     void readMolProps(ROMol *); // NOTE(review): presumably reads the data fields following a mol block into the molecule - confirm
00111     void checkForEnd();
00112     std::istream *dp_inStream; // NOTE(review): presumably the stream the molecules are read from - confirm
00113     bool df_owner; // NOTE(review): presumably whether we own dp_inStream - confirm
00114     bool df_end; // NOTE(review): presumably set once the end of the input has been reached - confirm
00115     int d_len; // total number of mol blocks in the file (initialized to -1)
00116     int d_last; // the molecule we are ready to read
00117     int d_line; // line number we are currently on
00118 #ifdef USEZIPSTREAM
00119     std::istream *dp_streamHolder; // only present when USEZIPSTREAM is defined
00120 #endif    
00121     std::vector<std::streampos> d_molpos; // stream positions of the mol blocks (see the class comment)
00122     bool df_sanitize,df_removeHs; // NOTE(review): presumably hold the sanitize / removeHs options - confirm
00123   };
00124 
00125   //! lazy file parser for Smiles tables
00126   class SmilesMolSupplier : public MolSupplier {
00127     /**************************************************************************
00128      * Lazy file parser for Smiles table files, similar to the lazy SD
00129      * file parser above
00130      *  - As and when new molecules are read using "next", their
00131      *    positions in the file are noted.
00132      *  - A call to "length" will automatically parse the entire
00133      *    file and cache all the molecule positions
00134      *  - The [] operator is used to access the molecule at "idx"; calling
00135      *    "next" following this will result in the next molecule after
00136      *    "idx"
00137      ***************************************************************************/ 
00138   public:
00139 
00140     /*! 
00141      *   \param fileName - the name of the smiles table file
00142      *   \param delimiter - delimiting characters between records on each
00143      *     line. NOTE that this is not a string: the tokenizer looks for
00144      *     the individual characters in delimiter, not the full string
00145      *     itself.  So the default delimiter: " \t", means " " or "\t".
00146      *   \param smilesColumn - column number for the SMILES string (defaults
00147      *     to the first column)
00148      *   \param nameColumn - column number for the molecule name (defaults to
00149      *     the second column). If set to -1 we assume that no name is
00150      *     available for the molecule and the name is defaulted to the
00151      *     smiles string
00152      *   \param titleLine - if true, the first line is assumed to list the
00153      *     names of properties in order, separated by 'delimiter'. It is
00154      *     also assumed that the 'SMILES' column and the 'name' column
00155      *     are not specified here. If false, no title line is assumed
00156      *     and the properties are recorded as "columnX", where "X" is
00157      *     the column number
00158      *   \param sanitize - if true, sanitize the molecule before returning it
00159      */
00160     explicit SmilesMolSupplier(const std::string &fileName, 
00161                                const std::string &delimiter=" \t",
00162                                int smilesColumn=0,
00163                                int nameColumn=1, 
00164                                bool titleLine=true,                
00165                                bool sanitize=true);
00166     SmilesMolSupplier();
00167     ~SmilesMolSupplier();
00168     void setData(const std::string &text,
00169                  const std::string &delimiter=" ", // NOTE(review): default differs from the constructor's " \t" - confirm this is intended
00170                  int smilesColumn=0,
00171                  int nameColumn=1, 
00172                  bool titleLine=true,              
00173                  bool sanitize=true);
00174     void init();   // implements MolSupplier::init()
00175     void reset();  // implements MolSupplier::reset()
00176     ROMol *next(); // implements MolSupplier::next()
00177     bool atEnd();  // implements MolSupplier::atEnd()
00178     void moveTo(unsigned int idx); // NOTE(review): presumably positions the supplier at item idx - confirm
00179     ROMol * operator[](unsigned int idx); // random access to the molecule at idx (see the class comment)
00180     /*! \brief returns the text block for a particular item
00181      *  
00182      *  \param idx - which item to return
00183      */
00184     std::string getItemText(unsigned int idx);
00185     unsigned int length(); // per the class comment, parses the entire file and caches the positions
00186 
00187   private:
00188     ROMol *processLine(std::string inLine); // inLine is taken by value; NOTE(review): presumably builds a molecule from one table line - confirm
00189     void processTitleLine();
00190     std::string nextLine();
00191     int skipComments();
00192     void checkForEnd();
00193     
00194     // stream to read the molecules from:
00195     std::istream *dp_inStream; 
00196 
00197     // do we own dp_inStream?  Originally noted as always true since we
00198     // are passed a file name (a constructor taking an 'istream' was planned).
00199     // NOTE(review): a default constructor and setData() also exist - confirm.
00200     bool df_owner; 
00201 
00202     bool df_end; // have we reached the end of the file?
00203     int d_len; // total number of smiles in the file
00204     int d_next; // the molecule we are ready to read
00205     int d_line; // line number we are currently on
00206     std::vector<std::streampos> d_molpos; // vector of positions in the file for molecules
00207     std::vector<int> d_lineNums; // NOTE(review): presumably the line number of each entry in d_molpos - confirm
00208     std::string d_delim; // the delimiter characters (each character is a delimiter, see the constructor docs)
00209     bool df_sanitize; // sanitize molecules before returning them?
00210     STR_VECT d_props; // vector of property names
00211     bool df_title; // do we have a title line?
00212     int d_smi; // column id for the smiles string
00213     int d_name; // column id for the name
00214   };
00215 
00216   //! lazy file parser for TDT files
00217   class TDTMolSupplier : public MolSupplier {
00218     /**************************************************************************
00219      * Lazy file parser for TDT files, similar to the lazy SD
00220      * file parser above
00221      *  - As and when new molecules are read using "next", their
00222      *    positions in the file are noted.
00223      *  - A call to "length" will automatically parse the entire
00224      *    file and cache all the molecule positions
00225      *  - The [] operator is used to access the molecule at "idx"; calling
00226      *    "next" following this will result in the next molecule after
00227      *    "idx"
00228      ***************************************************************************/ 
00229   public:
00230 
00231     /*! 
00232      *   \param fileName - the name of the TDT file
00233      *   \param nameRecord - property name for the molecule name.
00234      *     If empty (the default), the name defaults to be empty
00235      *   \param confId2D - if >=0 and 2D coordinates are provided, the 2D
00236      *                   structure (depiction) in the input will be read into the
00237      *                   corresponding conformer id (default: -1).
00238      *   \param confId3D - if >=0 and 3D coordinates are provided, the 3D
00239      *                   structure in the input will be read into the
00240      *                   corresponding conformer id (default: 0).
00241      *   \param sanitize - if true, sanitize the molecule before returning it
00242      */
00243     explicit TDTMolSupplier(const std::string &fileName, 
00244                             const std::string &nameRecord="",
00245                             int confId2D=-1,int confId3D=0,
00246                             bool sanitize=true);
00247     TDTMolSupplier();
00248     ~TDTMolSupplier();
00249     void setData(const std::string &text, // NOTE(review): undocumented; presumably supplies the TDT data as in-memory text - confirm
00250                  const std::string &nameRecord="",
00251                  int confId2D=-1,int confId3D=0,
00252                  bool sanitize=true);
00253     void init();   // implements MolSupplier::init()
00254     void reset();  // implements MolSupplier::reset()
00255     ROMol *next(); // implements MolSupplier::next()
00256     bool atEnd();  // implements MolSupplier::atEnd()
00257     void moveTo(unsigned int idx); // NOTE(review): presumably positions the supplier at item idx - confirm
00258     ROMol * operator[](unsigned int idx); // random access to the molecule at idx (see the class comment)
00259     /*! \brief returns the text block for a particular item
00260      *  
00261      *  \param idx - which item to return
00262      */
00263     std::string getItemText(unsigned int idx);
00264     unsigned int length(); // per the class comment, parses the entire file and caches the positions
00265 
00266   private:
00267     bool advanceToNextRecord();
00268     void checkForEnd();
00269     ROMol *parseMol(std::string inLine); // inLine is taken by value; NOTE(review): presumably builds a molecule from record text - confirm
00270     
00271     // stream to read the molecules from:
00272     std::istream *dp_inStream; 
00273 
00274     // do we own dp_inStream?  Originally noted as always true since we
00275     // are passed a file name (a constructor taking an 'istream' was planned).
00276     // NOTE(review): a default constructor and setData() also exist - confirm.
00277     bool df_owner; 
00278 
00279     bool df_end; // have we reached the end of the file?
00280     int d_len; // total number of mols in the file
00281     int d_next; // the molecule we are ready to read
00282     int d_last; // NOTE(review): was commented identically to d_next ("the molecule we are ready to read"); its distinct role is not visible in this header
00283     int d_line; // line number we are currently on
00284     int d_confId2D; // id to use for 2D conformers
00285     int d_confId3D; // id to use for 3D conformers
00286     std::vector<std::streampos> d_molpos; // vector of positions in the file for molecules
00287     bool df_sanitize; // sanitize molecules before returning them?
00288     std::string d_nameProp; // local storage for the property providing mol names
00289   };
00290 
00291 }
00292 
00293 #endif

Generated on Sat May 24 08:36:32 2008 for RDCode by  doxygen 1.5.3