00001 // 00002 // Copyright (C) 2002-2006 greg landrum and Rational Discovery LLC 00003 // 00004 // @@ All Rights Reserved @@ 00005 // 00006 #ifndef _RD_MOLSUPPLIER_H 00007 #define _RD_MOLSUPPLIER_H 00008 00009 #include <RDGeneral/types.h> 00010 00011 #include <string> 00012 #include <iostream> 00013 #include <GraphMol/ROMol.h> 00014 00015 namespace RDKit { 00016 00017 /*! 00018 // 00019 // Here are a couple of ways one can interact with MolSuppliers: 00020 // 00021 // 1) Lazy (ForwardIterator): 00022 // while(!supplier.atEnd()){ 00023 // ROMol *mol = supplier.next(); 00024 // if(mol){ 00025 // do something; 00026 // } 00027 // } 00028 // 2) Random Access: 00029 // for(int i=0;i<supplier.length();i++){ 00030 // ROMol *mol = supplier[i]; 00031 // if(mol){ 00032 // do something; 00033 // } 00034 // } 00035 // 00036 // 00037 */ 00038 class MolSupplier { 00039 // this is an abstract base class to supply molecules one at a time 00040 public: 00041 MolSupplier() {}; 00042 virtual ~MolSupplier() {}; 00043 virtual void init() = 0; 00044 virtual void reset() = 0; 00045 virtual bool atEnd() = 0; 00046 virtual ROMol *next() = 0; 00047 00048 private: 00049 // disable automatic copy constructors and assignment operators 00050 // for this class and its subclasses. They will likely be 00051 // carrying around stream pointers and copying those is a recipe 00052 // for disaster. 00053 MolSupplier(const MolSupplier&); 00054 MolSupplier &operator=(const MolSupplier&); 00055 00056 }; 00057 00058 00059 // \brief a lazy supplier from an SD file 00060 class SDMolSupplier : public MolSupplier { 00061 /************************************************************************* 00062 * A lazy mol supplier from a SD file. 00063 * - When new molecules are read using "next" their positions in the file are noted. 
00064 * - A call to the "length" will automatically parse the entire file and cache all the mol 00065 * block positions 00066 * - [] operator is used to access a molecule at "idx", calling next following this will result 00067 * in the next molecule after "idx" 00068 ***********************************************************************************/ 00069 00070 public: 00071 SDMolSupplier() { init(); }; 00072 00073 /*! 00074 * \param fileName - the name of the SD file 00075 * \param sanitize - if true sanitize the molecule before returning it 00076 * \param removeHs - if true remove Hs from the molecule before returning it 00077 * (triggers sanitization) 00078 */ 00079 explicit SDMolSupplier(const std::string &fileName, bool sanitize=true, 00080 bool removeHs=true); 00081 00082 ~SDMolSupplier(); 00083 void init(); 00084 void reset(); 00085 ROMol *next(); 00086 bool atEnd(); 00087 void moveTo(unsigned int idx); 00088 ROMol * operator[](unsigned int idx); 00089 /*! \brief returns the text block for a particular item 00090 * 00091 * \param idx - which item to return 00092 */ 00093 std::string getItemText(unsigned int idx); 00094 unsigned int length(); 00095 void setData(const std::string &text,bool sanitize=true, bool removeHs=true); 00096 00097 /*! Resets our internal state and sets the indices of molecules in the stream. 00098 * The client should be *very* careful about calling this method, as it's trivial 00099 * to end up with a completely useless supplier. 00100 * 00101 * \param locs - the vector of stream positions. 00102 * 00103 * Note that this can be used not only to make reading selected molecules from a 00104 * large SD file much faster, but it can also allow subsetting an SD file or 00105 * rearranging the order of the molecules. 
00106 */ 00107 void setStreamIndices(const std::vector<std::streampos> &locs); 00108 00109 private : 00110 void readMolProps(ROMol *); 00111 void checkForEnd(); 00112 std::istream *dp_inStream; 00113 bool df_owner; 00114 bool df_end; 00115 int d_len; // total number of mol blocks in the file (initialized to -1) 00116 int d_last; // the molecule we are ready to read 00117 int d_line; // line number we are currently on 00118 #ifdef USEZIPSTREAM 00119 std::istream *dp_streamHolder; 00120 #endif 00121 std::vector<std::streampos> d_molpos; 00122 bool df_sanitize,df_removeHs; 00123 }; 00124 00125 //! lazy file parser for Smiles tables 00126 class SmilesMolSupplier : public MolSupplier { 00127 /************************************************************************** 00128 * Lazy file parser for Smiles table file, similar to the lazy SD 00129 * file parser above 00130 * - As an when new molecules are read using "next" their 00131 * positions in the file are noted. 00132 * - A call to the "length" will autamatically parse the entire 00133 * file and cache all the mol block positions 00134 * - [] operator is used to access a molecule at "idx", calling 00135 * next following this will result in the next molecule after 00136 * "idx" 00137 ***************************************************************************/ 00138 public: 00139 00140 /*! 00141 * \param fileName - the name of smiles table file 00142 * \param delimiter - delimiting characters between records on a each 00143 * line NOTE that this is not a string, the tokenizer looks for 00144 * the individual characters in delimiter, not the full string 00145 * itself. So the default delimiter: " \t", means " " or "\t". 
00146 * \param smilesColumn - column number for the SMILES string (defaults 00147 * to the first column) 00148 * \param nameColumn - column number for the molecule name (defaults to 00149 * the second column) If set to -1 we assume that no name is 00150 * available for the molecule and the name is defaulted to the 00151 * smiles string 00152 * \param titleLine - if true, the first line is assumed to list the 00153 * names of properties in order seperated by 'delimiter'. It is 00154 * also assume that the 'SMILES' column and the 'name' column 00155 * are not specified here if false - no title line is assumed 00156 * and the properties are recorded as the "columnX" where "X" is 00157 * the cloumn number 00158 * \param sanitize - if true sanitize the molecule before returning it 00159 */ 00160 explicit SmilesMolSupplier(const std::string &fileName, 00161 const std::string &delimiter=" \t", 00162 int smilesColumn=0, 00163 int nameColumn=1, 00164 bool titleLine=true, 00165 bool sanitize=true); 00166 SmilesMolSupplier(); 00167 ~SmilesMolSupplier(); 00168 void setData(const std::string &text, 00169 const std::string &delimiter=" ", 00170 int smilesColumn=0, 00171 int nameColumn=1, 00172 bool titleLine=true, 00173 bool sanitize=true); 00174 void init(); 00175 void reset(); 00176 ROMol *next(); 00177 bool atEnd(); 00178 void moveTo(unsigned int idx); 00179 ROMol * operator[](unsigned int idx); 00180 /*! \brief returns the text block for a particular item 00181 * 00182 * \param idx - which item to return 00183 */ 00184 std::string getItemText(unsigned int idx); 00185 unsigned int length(); 00186 00187 private: 00188 ROMol *processLine(std::string inLine); 00189 void processTitleLine(); 00190 std::string nextLine(); 00191 int skipComments(); 00192 void checkForEnd(); 00193 00194 // stream to read the molecules from: 00195 std::istream *dp_inStream; 00196 00197 // do we own dp_inStream - right now this is always true since we 00198 // are passed in a file name. 
But later we should have a 00199 // constructor that can take a 'istream' 00200 bool df_owner; 00201 00202 bool df_end; // have we reached the end of the file? 00203 int d_len; // total number of smiles in the file 00204 int d_next; // the molecule we are ready to read 00205 int d_line; // line number we are currently on 00206 std::vector<std::streampos> d_molpos; // vector of positions in the file for molecules 00207 std::vector<int> d_lineNums; 00208 std::string d_delim; // the delimiter string 00209 bool df_sanitize; // sanitize molecules before returning them? 00210 STR_VECT d_props; // vector of property names 00211 bool df_title; // do we have a title line? 00212 int d_smi; // column id for the smile string 00213 int d_name; // column id for the name 00214 }; 00215 00216 //! lazy file parser for TDT files 00217 class TDTMolSupplier : public MolSupplier { 00218 /************************************************************************** 00219 * Lazy file parser for TDT files, similar to the lazy SD 00220 * file parser above 00221 * - As an when new molecules are read using "next" their 00222 * positions in the file are noted. 00223 * - A call to the "length" will autamatically parse the entire 00224 * file and cache all the mol block positions 00225 * - [] operator is used to access a molecule at "idx", calling 00226 * next following this will result in the next molecule after 00227 * "idx" 00228 ***************************************************************************/ 00229 public: 00230 00231 /*! 00232 * \param fileName - the name of the TDT file 00233 * \param nameRecord - property name for the molecule name. 00234 * If empty (the default), the name defaults to be empty 00235 * \param confId2D - if >=0 and 2D coordinates are provided, the 2D 00236 * structure (depiction) in the input will be read into the 00237 * corresponding conformer id. 
00238 * \param confId3D - if >=0 and 3D coordinates are provided, the 3D 00239 * structure (depiction) in the input will be read into the 00240 * corresponding conformer id. 00241 * \param sanitize - if true sanitize the molecule before returning it 00242 */ 00243 explicit TDTMolSupplier(const std::string &fileName, 00244 const std::string &nameRecord="", 00245 int confId2D=-1,int confId3D=0, 00246 bool sanitize=true); 00247 TDTMolSupplier(); 00248 ~TDTMolSupplier(); 00249 void setData(const std::string &text, 00250 const std::string &nameRecord="", 00251 int confId2D=-1,int confId3D=0, 00252 bool sanitize=true); 00253 void init(); 00254 void reset(); 00255 ROMol *next(); 00256 bool atEnd(); 00257 void moveTo(unsigned int idx); 00258 ROMol * operator[](unsigned int idx); 00259 /*! \brief returns the text block for a particular item 00260 * 00261 * \param idx - which item to return 00262 */ 00263 std::string getItemText(unsigned int idx); 00264 unsigned int length(); 00265 00266 private: 00267 bool advanceToNextRecord(); 00268 void checkForEnd(); 00269 ROMol *parseMol(std::string inLine); 00270 00271 // stream to read the molecules from: 00272 std::istream *dp_inStream; 00273 00274 // do we own dp_inStream - right now this is always true since we 00275 // are passed in a file name. But later we should have a 00276 // constructor that can take a 'istream' 00277 bool df_owner; 00278 00279 bool df_end; // have we reached the end of the file? 00280 int d_len; // total number of mols in the file 00281 int d_next; // the molecule we are ready to read 00282 int d_last; // the molecule we are ready to read 00283 int d_line; // line number we are currently on 00284 int d_confId2D; // id to use for 2D conformers 00285 int d_confId3D; // id to use for 3D conformers 00286 std::vector<std::streampos> d_molpos; // vector of positions in the file for molecules 00287 bool df_sanitize; // sanitize molecules before returning them? 
00288 std::string d_nameProp; // local storage for the property providing mol names 00289 }; 00290 00291 } 00292 00293 #endif
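// 1.5.3  (NOTE(review): looks like a documentation-generator version footer left over from extraction; not part of the header - confirm and remove)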