00001 // 00002 // Copyright (C) 2002-2008 greg landrum and Rational Discovery LLC 00003 // 00004 // @@ All Rights Reserved @@ 00005 // 00006 #ifndef _RD_MOLSUPPLIER_H 00007 #define _RD_MOLSUPPLIER_H 00008 00009 #include <RDGeneral/types.h> 00010 00011 #include <string> 00012 #include <iostream> 00013 #include <GraphMol/ROMol.h> 00014 00015 namespace RDKit { 00016 00017 /*! 00018 // 00019 // Here are a couple of ways one can interact with MolSuppliers: 00020 // 00021 // 1) Lazy (ForwardIterator): 00022 // while(!supplier.atEnd()){ 00023 // ROMol *mol = supplier.next(); 00024 // if(mol){ 00025 // do something; 00026 // } 00027 // } 00028 // 2) Random Access: 00029 // for(int i=0;i<supplier.length();i++){ 00030 // ROMol *mol = supplier[i]; 00031 // if(mol){ 00032 // do something; 00033 // } 00034 // } 00035 // 00036 // 00037 */ 00038 class MolSupplier { 00039 // this is an abstract base class to supply molecules one at a time 00040 public: 00041 MolSupplier() {}; 00042 virtual ~MolSupplier() {}; 00043 virtual void init() = 0; 00044 virtual void reset() = 0; 00045 virtual bool atEnd() = 0; 00046 virtual ROMol *next() = 0; 00047 00048 private: 00049 // disable automatic copy constructors and assignment operators 00050 // for this class and its subclasses. They will likely be 00051 // carrying around stream pointers and copying those is a recipe 00052 // for disaster. 00053 MolSupplier(const MolSupplier&); 00054 MolSupplier &operator=(const MolSupplier&); 00055 protected: 00056 // stream to read the molecules from: 00057 std::istream *dp_inStream; 00058 // do we own dp_inStream? 00059 bool df_owner; 00060 }; 00061 00062 00063 // \brief a lazy supplier from an SD file 00064 class SDMolSupplier : public MolSupplier { 00065 /************************************************************************* 00066 * A lazy mol supplier from a SD file. 00067 * - When new molecules are read using "next" their positions in the file are noted. 00068 * - A call to the "length" will automatically parse the entire file and cache all the mol 00069 * block positions 00070 * - [] operator is used to access a molecule at "idx", calling next following this will result 00071 * in the next molecule after "idx" 00072 ***********************************************************************************/ 00073 00074 public: 00075 SDMolSupplier() { init(); }; 00076 00077 /*! 00078 * \param fileName - the name of the SD file 00079 * \param sanitize - if true sanitize the molecule before returning it 00080 * \param removeHs - if true remove Hs from the molecule before returning it 00081 * (triggers sanitization) 00082 */ 00083 explicit SDMolSupplier(const std::string &fileName, bool sanitize=true, 00084 bool removeHs=true); 00085 00086 explicit SDMolSupplier(std::istream *inStream, bool takeOwnership=true, 00087 bool sanitize=true,bool removeHs=true); 00088 00089 00090 ~SDMolSupplier(); 00091 void init(); 00092 void reset(); 00093 ROMol *next(); 00094 bool atEnd(); 00095 void moveTo(unsigned int idx); 00096 ROMol * operator[](unsigned int idx); 00097 /*! \brief returns the text block for a particular item 00098 * 00099 * \param idx - which item to return 00100 */ 00101 std::string getItemText(unsigned int idx); 00102 unsigned int length(); 00103 void setData(const std::string &text,bool sanitize=true, bool removeHs=true); 00104 00105 /*! Resets our internal state and sets the indices of molecules in the stream. 00106 * The client should be *very* careful about calling this method, as it's trivial 00107 * to end up with a completely useless supplier. 00108 * 00109 * \param locs - the vector of stream positions. 00110 * 00111 * Note that this can be used not only to make reading selected molecules from a 00112 * large SD file much faster, but it can also allow subsetting an SD file or 00113 * rearranging the order of the molecules. 00114 */ 00115 void setStreamIndices(const std::vector<std::streampos> &locs); 00116 00117 private : 00118 void readMolProps(ROMol *); 00119 void checkForEnd(); 00120 bool df_end; 00121 int d_len; // total number of mol blocks in the file (initialized to -1) 00122 int d_last; // the molecule we are ready to read 00123 int d_line; // line number we are currently on 00124 std::vector<std::streampos> d_molpos; 00125 bool df_sanitize,df_removeHs; 00126 }; 00127 00128 //! lazy file parser for Smiles tables 00129 class SmilesMolSupplier : public MolSupplier { 00130 /************************************************************************** 00131 * Lazy file parser for Smiles table file, similar to the lazy SD 00132 * file parser above 00133 * - As an when new molecules are read using "next" their 00134 * positions in the file are noted. 00135 * - A call to the "length" will autamatically parse the entire 00136 * file and cache all the mol block positions 00137 * - [] operator is used to access a molecule at "idx", calling 00138 * next following this will result in the next molecule after 00139 * "idx" 00140 ***************************************************************************/ 00141 public: 00142 00143 /*! 00144 * \param fileName - the name of smiles table file 00145 * \param delimiter - delimiting characters between records on a each 00146 * line NOTE that this is not a string, the tokenizer looks for 00147 * the individual characters in delimiter, not the full string 00148 * itself. So the default delimiter: " \t", means " " or "\t". 00149 * \param smilesColumn - column number for the SMILES string (defaults 00150 * to the first column) 00151 * \param nameColumn - column number for the molecule name (defaults to 00152 * the second column) If set to -1 we assume that no name is 00153 * available for the molecule and the name is defaulted to the 00154 * smiles string 00155 * \param titleLine - if true, the first line is assumed to list the 00156 * names of properties in order seperated by 'delimiter'. It is 00157 * also assume that the 'SMILES' column and the 'name' column 00158 * are not specified here if false - no title line is assumed 00159 * and the properties are recorded as the "columnX" where "X" is 00160 * the cloumn number 00161 * \param sanitize - if true sanitize the molecule before returning it 00162 */ 00163 explicit SmilesMolSupplier(const std::string &fileName, 00164 const std::string &delimiter=" \t", 00165 int smilesColumn=0, 00166 int nameColumn=1, 00167 bool titleLine=true, 00168 bool sanitize=true); 00169 SmilesMolSupplier(); 00170 explicit SmilesMolSupplier(std::istream *inStream, bool takeOwnership=true, 00171 const std::string &delimiter=" \t", 00172 int smilesColumn=0, 00173 int nameColumn=1, 00174 bool titleLine=true, 00175 bool sanitize=true); 00176 00177 ~SmilesMolSupplier(); 00178 void setData(const std::string &text, 00179 const std::string &delimiter=" ", 00180 int smilesColumn=0, 00181 int nameColumn=1, 00182 bool titleLine=true, 00183 bool sanitize=true); 00184 void init(); 00185 void reset(); 00186 ROMol *next(); 00187 bool atEnd(); 00188 void moveTo(unsigned int idx); 00189 ROMol * operator[](unsigned int idx); 00190 /*! \brief returns the text block for a particular item 00191 * 00192 * \param idx - which item to return 00193 */ 00194 std::string getItemText(unsigned int idx); 00195 unsigned int length(); 00196 00197 private: 00198 ROMol *processLine(std::string inLine); 00199 void processTitleLine(); 00200 std::string nextLine(); 00201 int skipComments(); 00202 void checkForEnd(); 00203 00204 bool df_end; // have we reached the end of the file? 00205 int d_len; // total number of smiles in the file 00206 int d_next; // the molecule we are ready to read 00207 int d_line; // line number we are currently on 00208 std::vector<std::streampos> d_molpos; // vector of positions in the file for molecules 00209 std::vector<int> d_lineNums; 00210 std::string d_delim; // the delimiter string 00211 bool df_sanitize; // sanitize molecules before returning them? 00212 STR_VECT d_props; // vector of property names 00213 bool df_title; // do we have a title line? 00214 int d_smi; // column id for the smile string 00215 int d_name; // column id for the name 00216 }; 00217 00218 //! lazy file parser for TDT files 00219 class TDTMolSupplier : public MolSupplier { 00220 /************************************************************************** 00221 * Lazy file parser for TDT files, similar to the lazy SD 00222 * file parser above 00223 * - As an when new molecules are read using "next" their 00224 * positions in the file are noted. 00225 * - A call to the "length" will autamatically parse the entire 00226 * file and cache all the mol block positions 00227 * - [] operator is used to access a molecule at "idx", calling 00228 * next following this will result in the next molecule after 00229 * "idx" 00230 ***************************************************************************/ 00231 public: 00232 00233 /*! 00234 * \param fileName - the name of the TDT file 00235 * \param nameRecord - property name for the molecule name. 00236 * If empty (the default), the name defaults to be empty 00237 * \param confId2D - if >=0 and 2D coordinates are provided, the 2D 00238 * structure (depiction) in the input will be read into the 00239 * corresponding conformer id. 00240 * \param confId3D - if >=0 and 3D coordinates are provided, the 3D 00241 * structure (depiction) in the input will be read into the 00242 * corresponding conformer id. 00243 * \param sanitize - if true sanitize the molecule before returning it 00244 */ 00245 explicit TDTMolSupplier(const std::string &fileName, 00246 const std::string &nameRecord="", 00247 int confId2D=-1,int confId3D=0, 00248 bool sanitize=true); 00249 explicit TDTMolSupplier(std::istream *inStream, bool takeOwnership=true, 00250 const std::string &nameRecord="", 00251 int confId2D=-1,int confId3D=0, 00252 bool sanitize=true); 00253 TDTMolSupplier(); 00254 ~TDTMolSupplier(); 00255 void setData(const std::string &text, 00256 const std::string &nameRecord="", 00257 int confId2D=-1,int confId3D=0, 00258 bool sanitize=true); 00259 void init(); 00260 void reset(); 00261 ROMol *next(); 00262 bool atEnd(); 00263 void moveTo(unsigned int idx); 00264 ROMol * operator[](unsigned int idx); 00265 /*! \brief returns the text block for a particular item 00266 * 00267 * \param idx - which item to return 00268 */ 00269 std::string getItemText(unsigned int idx); 00270 unsigned int length(); 00271 00272 private: 00273 bool advanceToNextRecord(); 00274 void checkForEnd(); 00275 ROMol *parseMol(std::string inLine); 00276 00277 bool df_end; // have we reached the end of the file? 00278 int d_len; // total number of mols in the file 00279 int d_next; // the molecule we are ready to read 00280 int d_last; // the molecule we are ready to read 00281 int d_line; // line number we are currently on 00282 int d_confId2D; // id to use for 2D conformers 00283 int d_confId3D; // id to use for 3D conformers 00284 std::vector<std::streampos> d_molpos; // vector of positions in the file for molecules 00285 bool df_sanitize; // sanitize molecules before returning them? 00286 std::string d_nameProp; // local storage for the property providing mol names 00287 }; 00288 00289 } 00290 00291 #endif
1.5.6