RDKit
Open-source cheminformatics and machine learning.
SubstructLibrary.h
Go to the documentation of this file.
1 // Copyright (c) 2017, Novartis Institutes for BioMedical Research Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following
12 // disclaimer in the documentation and/or other materials provided
13 // with the distribution.
14 // * Neither the name of Novartis Institutes for BioMedical Research Inc.
15 // nor the names of its contributors may be used to endorse or promote
16 // products derived from this software without specific prior written
17 // permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 //
31 #ifndef RDKIT_SUBSTRUCT_LIBRARY
32 #define RDKIT_SUBSTRUCT_LIBRARY
33 
34 #include <GraphMol/RDKitBase.h>
35 #include <GraphMol/MolPickler.h>
40 #include <DataStructs/BitOps.h>
41 
42 namespace RDKit {
43 
44 //! Base class API for holding molecules for substructure search.
45 /*!
46  This is an API that hides the implementation details used for
47  indexing molecules for substructure searching. It simply
48  provides an API for adding and getting molecules from a set.
49  */
51  public:
52 virtual ~MolHolderBase() {}  // virtual: lets a holder be destroyed through a MolHolderBase pointer
53 
54 //! Add a new molecule to the substructure search library.
55 //! Returns the molecule's index in the library.
56 virtual unsigned int addMol( const ROMol &m ) = 0;
57 
58 //! Get the molecule at the given index; implementations should throw IndexErrorException when it is out of range
59 virtual boost::shared_ptr<ROMol> getMol(unsigned int) const = 0;
60 
61 //! Get the current library size, i.e. the number of molecules held
62 virtual unsigned int size() const = 0;
63 };
64 
65 //! Concrete class that holds molecules in memory
66 /*!
67  This is currently one of the faster implementations.
68  However it is very memory intensive.
69 */
70 class MolHolder : public MolHolderBase {
71  std::vector<boost::shared_ptr<ROMol> > mols;
72 
73  public:
74  MolHolder() : MolHolderBase(), mols() {}
75 
76  virtual unsigned int addMol(const ROMol &m) {
77  mols.push_back(boost::make_shared<ROMol>(m));
78  return size() - 1;
79  }
80 
81  virtual boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
82  if(idx >= mols.size())
83  throw IndexErrorException(idx);
84  return mols[idx];
85  }
86 
87  virtual unsigned int size() const {
88  return rdcast<unsigned int>(mols.size());
89  }
90 };
91 
92 //! Concrete class that holds binary cached molecules in memory
93 /*!
94  This implementation uses quite a bit less memory than the
95  non cached implementation. However, due to the reduced speed
96  it should be used in conjunction with a pattern fingerprinter.
97 
98  See RDKit::FPHolder
99 */
101  std::vector<std::string> mols;
102 
103  public:
105 
106  virtual unsigned int addMol(const ROMol &m) {
107  mols.push_back(std::string());
108  MolPickler::pickleMol(m, mols.back());
109  return size() - 1;
110  }
111 
112  //! Adds a pickled binary molecule, no validity checking of the input
113  //! is done.
114  unsigned int addBinary(const std::string &pickle) {
115  mols.push_back( pickle );
116  return size()-1;
117  }
118 
119  virtual boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
120  if(idx >= mols.size())
121  throw IndexErrorException(idx);
122  boost::shared_ptr<ROMol> mol(new ROMol);
123  MolPickler::molFromPickle(mols[idx], mol.get());
124  return mol;
125  }
126 
127  virtual unsigned int size() const {
128  return rdcast<unsigned int>(mols.size());
129  }
130 };
131 
132 //! Concrete class that holds smiles strings in memory
133 /*!
134  This implementation uses quite a bit less memory than the
135  cached binary or uncached implementation. However, due to the
136  reduced speed it should be used in conjunction with a pattern
137  fingerprinter.
138 
139  See RDKit::FPHolder
140 */
142  std::vector<std::string> mols;
143 
144  public:
146 
147  virtual unsigned int addMol(const ROMol &m) {
148  bool doIsomericSmiles = true;
149  mols.push_back(MolToSmiles(m, doIsomericSmiles));
150  return size() - 1;
151  }
152 
153  //! Add a smiles to the dataset, no validation is done
154  //! to the inputs.
155  unsigned int addSmiles( const std::string &smiles ) {
156  mols.push_back(smiles);
157  return size() - 1;
158  }
159 
160  virtual boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
161  if(idx >= mols.size())
162  throw IndexErrorException(idx);
163 
164  boost::shared_ptr<ROMol> mol(SmilesToMol(mols[idx]));
165  return mol;
166  }
167 
168  virtual unsigned int size() const {
169  return rdcast<unsigned int>(mols.size());
170  }
171 };
172 
173 //! Concrete class that holds trusted smiles strings in memory
174 /*!
175  A trusted smiles is essentially a smiles string that
176  RDKit has generated. This indicates that fewer
177  sanitization steps are required. See
178  http://rdkit.blogspot.com/2016/09/avoiding-unnecessary-work-and.html
179 
180  This implementation uses quite a bit less memory than the
181  cached binary or uncached implementation. However, due to the
182  reduced speed it should be used in conjunction with a pattern
183  fingerprinter.
184 
185  See RDKit::FPHolder
186 */
188  std::vector<std::string> mols;
189 
190  public:
192 
193  virtual unsigned int addMol(const ROMol &m) {
194  bool doIsomericSmiles = true;
195  mols.push_back(MolToSmiles(m, doIsomericSmiles));
196  return size() - 1;
197  }
198 
199  //! Add a smiles to the dataset, no validation is done
200  //! to the inputs.
201  unsigned int addSmiles( const std::string &smiles ) {
202  mols.push_back(smiles);
203  return size() - 1;
204  }
205 
206  virtual boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
207  if(idx >= mols.size())
208  throw IndexErrorException(idx);
209 
210  RWMol *m = SmilesToMol(mols[idx], 0, false);
211  m->updatePropertyCache();
212  return boost::shared_ptr<ROMol>(m);
213  }
214 
215  virtual unsigned int size() const {
216  return rdcast<unsigned int>(mols.size());
217  }
218 };
219 
220 //! Base FPI for the fingerprinter used to rule out impossible matches
222  std::vector<ExplicitBitVect *> fps;
223 
224  public:
225  virtual ~FPHolderBase() {
226  for (size_t i = 0; i < fps.size(); ++i) delete fps[i];
227 
228  }
229 
230  //! Adds a molecule to the fingerprinter
231  unsigned int addMol( const ROMol &m) {
232  fps.push_back(makeFingerprint(m));
233  return rdcast<unsigned int>(fps.size() - 1);
234  }
235 
236  //! Adds a raw bit vector to the fingerprinter
237  unsigned int addFingerprint( const ExplicitBitVect &v ) {
238  fps.push_back( new ExplicitBitVect(v) );
239  return rdcast<unsigned int>(fps.size() - 1);
240  }
241 
242  //! Return false if a substructure search can never match the molecule
243  bool passesFilter(unsigned int idx, const ExplicitBitVect &query) const {
244  if(idx >= fps.size())
245  throw IndexErrorException(idx);
246 
247  return AllProbeBitsMatch(query, *fps[idx]);
248  }
249 
250  //! Get the bit vector at the specified index (throws IndexError if out of range)
251  const ExplicitBitVect &getFingerprint(unsigned int idx) const {
252  if(idx >= fps.size())
253  throw IndexErrorException(idx);
254  return *fps[idx];
255  }
256 
257 //! Make the fingerprint bit vector for molecule \c m.
258 //! Caller owns the returned vector and is responsible for deleting it!
259 virtual ExplicitBitVect *makeFingerprint(const ROMol &m) const = 0;
260 };
261 
262 //! Uses the pattern fingerprinter to rule out matches
263 class PatternHolder : public FPHolderBase {
264 public:
265  //! Caller owns the vector!
266  virtual ExplicitBitVect *makeFingerprint(const ROMol &m) const {
267  return PatternFingerprintMol(m, 2048);
268  }
269 };
270 
271 //! Substructure search over a library of molecules
272 /*! This class allows for multithreaded substructure searches of
273  large datasets.
274 
275  The implementations can use fingerprints to speed up searches
276  and have molecules cached as binary forms to reduce memory
277  usage.
278 */
280  boost::shared_ptr<MolHolderBase> molholder;
281  boost::shared_ptr<FPHolderBase> fpholder;
282  MolHolderBase *mols; // used for a small optimization
283  FPHolderBase *fps;
284 
285  public:
287  : molholder(new MolHolder),
288  fpholder(),
289  mols(molholder.get()),
290  fps(NULL) {}
291 
292  SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules)
293  : molholder(molecules), fpholder(), mols(molholder.get()), fps(0) {}
294 
295  SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules,
296  boost::shared_ptr<FPHolderBase> fingerprints)
297  : molholder(molecules),
298  fpholder(fingerprints),
299  mols(molholder.get()),
300  fps(fpholder.get()) {}
301 
302  //!Get the underlying molecule holder implementation
304  PRECONDITION(mols, "Molecule holder NULL in SubstructLibrary");
305  return *mols;
306  }
307 
308  const MolHolderBase & getMolecules() const {
309  PRECONDITION(mols, "Molecule holder NULL in SubstructLibrary");
310  return *mols;
311  }
312 
313  //!Get the underlying fingerprint implementation.
314  /*! Throws a value error if no fingerprints have been set */
316  if (!fps)
317  throw ValueErrorException("Substruct Library does not have fingerprints");
318  return *fps;
319  }
320 
321  const FPHolderBase & getFingerprints() const {
322  if (!fps)
323  throw ValueErrorException("Substruct Library does not have fingerprints");
324  return *fps;
325  }
326 
327 //! Add a molecule to the library
328 /*!
329 \param mol Molecule to add
330
331 \return index of the added molecule in the library
332 */
333 unsigned int addMol(const ROMol &mol);
334 
335 //! Get the indices of the library molecules that match the query
336 /*!
337 \param query Query to match against molecules
338 \param recursionPossible flags whether or not recursive matches are allowed [ default true ]
339 \param useChirality use atomic CIP codes as part of the comparison [ default true ]
340 \param useQueryQueryMatches if set, the contents of atom and bond queries
341 will be used as part of the matching [ default false ]
342 \param numThreads If -1 use all available processors [ default -1 ]
343 \param maxResults Maximum results to return, -1 means return all
344 [ default -1 ]
345 */
346 std::vector<unsigned int> getMatches(const ROMol &query,
347 bool recursionPossible=true,
348 bool useChirality=true,
349 bool useQueryQueryMatches=false,
350 int numThreads=-1,
351 int maxResults=-1);
352 //! Get the indices of the matching molecules, searching only between the given indices
353 /*!
354 \param query Query to match against molecules
355 \param startIdx Start index of the search
356 \param endIdx Ending idx (non-inclusive) of the search.
357 \param recursionPossible flags whether or not recursive matches are allowed [ default true ]
358 \param useChirality use atomic CIP codes as part of the comparison [ default true ]
359 \param useQueryQueryMatches if set, the contents of atom and bond queries
360 will be used as part of the matching [ default false ]
361 \param numThreads If -1 use all available processors [ default -1 ]
362 \param maxResults Maximum results to return, -1 means return all
363 [ default -1 ]
364 */
365 std::vector<unsigned int> getMatches(const ROMol &query,
366 unsigned int startIdx, unsigned int endIdx,
367 bool recursionPossible=true,
368 bool useChirality=true,
369 bool useQueryQueryMatches=false,
370 int numThreads=-1,
371 int maxResults=-1);
372 
373 //! Return the number of library molecules that match the query
374 /*!
375 \param query Query to match against molecules
376 \param recursionPossible flags whether or not recursive matches are allowed [ default true ]
377 \param useChirality use atomic CIP codes as part of the comparison [ default true ]
378 \param useQueryQueryMatches if set, the contents of atom and bond queries
379 will be used as part of the matching [ default false ]
380 \param numThreads If -1 use all available processors [ default -1 ]
381 */
382 unsigned int countMatches(const ROMol &query,
383 bool recursionPossible=true,
384 bool useChirality=true,
385 bool useQueryQueryMatches=false,
386 int numThreads=-1);
387 //! Return the number of matches for the query, searching only between the given indices
388 /*!
389 \param query Query to match against molecules
390 \param startIdx Start index of the search
391 \param endIdx Ending idx (non-inclusive) of the search.
392 \param recursionPossible flags whether or not recursive matches are allowed [ default true ]
393 \param useChirality use atomic CIP codes as part of the comparison [ default true ]
394 \param useQueryQueryMatches if set, the contents of atom and bond queries
395 will be used as part of the matching [ default false ]
396 \param numThreads If -1 use all available processors [ default -1 ]
397 */
398 unsigned int countMatches(const ROMol &query,
399 unsigned int startIdx, unsigned int endIdx,
400 bool recursionPossible=true,
401 bool useChirality=true,
402 bool useQueryQueryMatches=false,
403 int numThreads=-1);
404 
405 //! Returns true if at least one library molecule matches the query
406 /*!
407 \param query Query to match against molecules
408 \param recursionPossible flags whether or not recursive matches are allowed [ default true ]
409 \param useChirality use atomic CIP codes as part of the comparison [ default true ]
410 \param useQueryQueryMatches if set, the contents of atom and bond queries
411 will be used as part of the matching [ default false ]
412 \param numThreads If -1 use all available processors [ default -1 ]
413 */
414 bool hasMatch(const ROMol &query,
415 bool recursionPossible=true,
416 bool useChirality=true,
417 bool useQueryQueryMatches=false,
418 int numThreads=-1);
419 //! Returns true if any match exists for the query between the specified indices
420 /*!
421 \param query Query to match against molecules
422 \param startIdx Start index of the search
423 \param endIdx Ending idx (inclusive) of the search. NOTE(review): the ranged getMatches/countMatches overloads document endIdx as non-inclusive -- confirm against the implementation.
424 \param recursionPossible flags whether or not recursive matches are allowed [ default true ]
425 \param useChirality use atomic CIP codes as part of the comparison [ default true ]
426 \param useQueryQueryMatches if set, the contents of atom and bond queries
427 will be used as part of the matching [ default false ]
428 \param numThreads If -1 use all available processors [ default -1 ]
429 */
430 bool hasMatch(const ROMol &query,
431 unsigned int startIdx, unsigned int endIdx,
432 bool recursionPossible=true,
433 bool useChirality=true,
434 bool useQueryQueryMatches=false,
435 int numThreads=-1);
436 
437  //! Returns the molecule at the given index
438  /*!
439  \param idx Index of the molecule in the library
440  */
441  boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
442  // expects implementation to throw IndexError if out of range
443  PRECONDITION(mols, "molholder is null in SubstructLibrary");
444  return mols->getMol(idx);
445  }
446 
447  //! Returns the molecule at the given index
448  /*!
449  \param idx Index of the molecule in the library
450  */
451  boost::shared_ptr<ROMol> operator[] (unsigned int idx) {
452  // expects implementation to throw IndexError if out of range
453  PRECONDITION(mols, "molholder is null in SubstructLibrary");
454  return mols->getMol(idx);
455  }
456 
457  //! return the number of molecules in the library
458  unsigned int size() const {
459  PRECONDITION(mols, "molholder is null in SubstructLibrary");
460  return rdcast<unsigned int>(molholder->size());
461  }
462 };
463 }
464 
465 #endif
virtual boost::shared_ptr< ROMol > getMol(unsigned int idx) const
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules, boost::shared_ptr< FPHolderBase > fingerprints)
virtual unsigned int size() const
Get the current library size.
static void pickleMol(const ROMol *mol, std::ostream &ss)
pickles a molecule and sends the results to stream ss
bool passesFilter(unsigned int idx, const ExplicitBitVect &query) const
Return false if a substructure search can never match the molecule.
MolHolderBase & getMolHolder()
Get the underlying molecule holder implementation.
RWMol is a molecule class that is intended to be edited.
Definition: RWMol.h:30
std::string MolToSmiles(const ROMol &mol, bool doIsomericSmiles=false, bool doKekule=false, int rootedAtAtom=-1, bool canonical=true, bool allBondsExplicit=false, bool allHsExplicit=false)
returns canonical SMILES for a molecule
Concrete class that holds molecules in memory.
virtual unsigned int addMol(const ROMol &m)
Concrete class that holds trusted smiles strings in memory.
pulls in the core RDKit functionality
ROMol is a molecule class that is intended to have a fixed topology.
Definition: ROMol.h:103
virtual boost::shared_ptr< ROMol > getMol(unsigned int idx) const
const FPHolderBase & getFingerprints() const
virtual unsigned int addMol(const ROMol &m)
unsigned int addMol(const ROMol &m)
Adds a molecule to the fingerprinter.
Base FPI for the fingerprinter used to rule out impossible matches.
virtual ExplicitBitVect * makeFingerprint(const ROMol &m) const
Caller owns the vector!
virtual boost::shared_ptr< ROMol > getMol(unsigned int) const =0
virtual unsigned int addMol(const ROMol &m)
void updatePropertyCache(bool strict=true)
calculates any of our lazy properties
RWMol * SmilesToMol(const std::string &smi, const SmilesParserParams &params)
ExplicitBitVect * PatternFingerprintMol(const ROMol &mol, unsigned int fpSize=2048, std::vector< unsigned int > *atomCounts=0, ExplicitBitVect *setOnlyBits=0)
Generates a topological fingerprint for a molecule using a series of pre-defined structural patterns...
Std stuff.
Definition: Atom.h:29
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules)
Class to allow us to throw an IndexError from C++ and have it make it back to Python.
Definition: Exceptions.h:18
Base class API for holding molecules for substructure search.
boost::shared_ptr< ROMol > getMol(unsigned int idx) const
Returns the molecule at the given index.
virtual unsigned int size() const
Get the current library size.
virtual unsigned int addMol(const ROMol &m)=0
unsigned int size() const
return the number of molecules in the library
unsigned int addSmiles(const std::string &smiles)
virtual boost::shared_ptr< ROMol > getMol(unsigned int idx) const
Contains general bit-comparison and similarity operations.
unsigned int addSmiles(const std::string &smiles)
virtual unsigned int size() const
Get the current library size.
static void molFromPickle(const std::string &pickle, ROMol *mol)
constructs a molecule from a pickle stored in a string
unsigned int addBinary(const std::string &pickle)
Concrete class that holds binary cached molecules in memory.
const ExplicitBitVect & getFingerprint(unsigned int idx) const
Get the bit vector at the specified index (throws IndexError if out of range)
#define PRECONDITION(expr, mess)
Definition: Invariant.h:107
const MolHolderBase & getMolecules() const
Uses the pattern fingerprinter to rule out matches.
virtual unsigned int addMol(const ROMol &m)
unsigned int addFingerprint(const ExplicitBitVect &v)
Adds a raw bit vector to the fingerprinter.
bool AllProbeBitsMatch(const char *probe, const char *ref)
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:32
virtual boost::shared_ptr< ROMol > getMol(unsigned int idx) const
a class for bit vectors that are densely occupied
virtual unsigned int size() const =0
Get the current library size.
FPHolderBase & getFingerprints()
Get the underlying fingerprint implementation.
virtual unsigned int size() const
Get the current library size.
Concrete class that holds smiles strings in memory.
Substructure search over a library of molecules.
void pickle(const boost::shared_ptr< EnumerationStrategyBase > &enumerator, std::ostream &ss)
pickles a EnumerationStrategy and adds the results to a stream ss