RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
SubstructLibrary.h
Go to the documentation of this file.
1// Copyright (c) 2017-2021, Novartis Institutes for BioMedical Research Inc.
2// and other RDKit contributors
3//
4// All rights reserved.
5//
6// Redistribution and use in source and binary forms, with or without
7// modification, are permitted provided that the following conditions are
8// met:
9//
10// * Redistributions of source code must retain the above copyright
11// notice, this list of conditions and the following disclaimer.
12// * Redistributions in binary form must reproduce the above
13// copyright notice, this list of conditions and the following
14// disclaimer in the documentation and/or other materials provided
15// with the distribution.
16// * Neither the name of Novartis Institutes for BioMedical Research Inc.
17// nor the names of its contributors may be used to endorse or promote
18// products derived from this software without specific prior written
19// permission.
20//
21// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32//
33#ifndef RDK_SUBSTRUCT_LIBRARY
34#define RDK_SUBSTRUCT_LIBRARY
35#include <utility>
36
37#include <RDGeneral/export.h>
38#include <GraphMol/RDKitBase.h>
39#include <GraphMol/MolPickler.h>
40#include <GraphMol/MolBundle.h>
46#include <DataStructs/BitOps.h>
47#include <GraphMol/MolOps.h>
50
51#include <algorithm>
52#include <string>
53#include <boost/lexical_cast.hpp>
54
55namespace RDKit {
56
57using GeneralizedSubstruct::ExtendedQueryMol;
58
60
61//! Base class API for holding molecules to substructure search.
62/*!
63 This is an API that hides the implementation details used for
64 indexing molecules for substructure searching. It simply
65 provides an API for adding and getting molecules from a set.
66 */
68 public:
69 virtual ~MolHolderBase() {}
70
71 //! Add a new molecule to the substructure search library
72 //! Returns the molecules index in the library
73 virtual unsigned int addMol(const ROMol &m) = 0;
74
75 // implementations should throw IndexError on out of range
76 virtual boost::shared_ptr<ROMol> getMol(unsigned int) const = 0;
77
78 //! Get the current library size
79 virtual unsigned int size() const = 0;
80};
81
82//! Concrete class that holds molecules in memory
83/*!
84 This is currently one of the faster implementations.
85 However it is very memory intensive.
86*/
88 std::vector<boost::shared_ptr<ROMol>> mols;
89
90 public:
91 MolHolder() : MolHolderBase(), mols() {}
92
93 unsigned int addMol(const ROMol &m) override {
94 mols.push_back(boost::make_shared<ROMol>(m));
95 return size() - 1;
96 }
97
98 boost::shared_ptr<ROMol> getMol(unsigned int idx) const override {
99 if (idx >= mols.size()) {
100 throw IndexErrorException(idx);
101 }
102 return mols[idx];
103 }
104
105 unsigned int size() const override {
106 return rdcast<unsigned int>(mols.size());
107 }
108
109 std::vector<boost::shared_ptr<ROMol>> &getMols() { return mols; }
110 const std::vector<boost::shared_ptr<ROMol>> &getMols() const { return mols; }
111};
112
113//! Concrete class that holds binary cached molecules in memory
114/*!
115 This implementation uses quite a bit less memory than the
116 non cached implementation. However, due to the reduced speed
117 it should be used in conjunction with a pattern fingerprinter.
118
119 See RDKit::FPHolder
120*/
122 std::vector<std::string> mols;
123
124 public:
126
127 unsigned int addMol(const ROMol &m) override {
128 mols.emplace_back();
129 MolPickler::pickleMol(m, mols.back());
130 return size() - 1;
131 }
132
133 //! Adds a pickled binary molecule, no validity checking of the input
134 //! is done.
135 unsigned int addBinary(const std::string &pickle) {
136 mols.push_back(pickle);
137 return size() - 1;
138 }
139
140 boost::shared_ptr<ROMol> getMol(unsigned int idx) const override {
141 if (idx >= mols.size()) {
142 throw IndexErrorException(idx);
143 }
144 boost::shared_ptr<ROMol> mol(new ROMol);
145 MolPickler::molFromPickle(mols[idx], mol.get());
146 return mol;
147 }
148
149 unsigned int size() const override {
150 return rdcast<unsigned int>(mols.size());
151 }
152
153 std::vector<std::string> &getMols() { return mols; }
154 const std::vector<std::string> &getMols() const { return mols; }
155};
156
157//! Concrete class that holds smiles strings in memory
158/*!
159 This implementation uses quite a bit less memory than the
160 cached binary or uncached implementation. However, due to the
161 reduced speed it should be used in conjunction with a pattern
162 fingerprinter.
163
164 See RDKit::FPHolder
165*/
167 : public MolHolderBase {
168 std::vector<std::string> mols;
169
170 public:
172
173 unsigned int addMol(const ROMol &m) override {
174 bool doIsomericSmiles = true;
175 mols.push_back(MolToSmiles(m, doIsomericSmiles));
176 return size() - 1;
177 }
178
179 //! Add a smiles to the dataset, no validation is done
180 //! to the inputs.
181 unsigned int addSmiles(const std::string &smiles) {
182 mols.push_back(smiles);
183 return size() - 1;
184 }
185
186 boost::shared_ptr<ROMol> getMol(unsigned int idx) const override {
187 if (idx >= mols.size()) {
188 throw IndexErrorException(idx);
189 }
190
191 boost::shared_ptr<ROMol> mol(SmilesToMol(mols[idx]));
192 return mol;
193 }
194
195 unsigned int size() const override {
196 return rdcast<unsigned int>(mols.size());
197 }
198
199 std::vector<std::string> &getMols() { return mols; }
200 const std::vector<std::string> &getMols() const { return mols; }
201};
202
203//! Concrete class that holds trusted smiles strings in memory
204/*!
205 A trusted smiles is essentially a smiles string that
206 RDKit has generated. This indicates that fewer
207 sanitization steps are required. See
208 http://rdkit.blogspot.com/2016/09/avoiding-unnecessary-work-and.html
209
210 This implementation uses quite a bit less memory than the
211 cached binary or uncached implementation. However, due to the
212 reduced speed it should be used in conjunction with a pattern
213 fingerprinter.
214
215 See RDKit::FPHolder
216*/
218 : public MolHolderBase {
219 std::vector<std::string> mols;
220
221 public:
223
224 unsigned int addMol(const ROMol &m) override {
225 bool doIsomericSmiles = true;
226 mols.push_back(MolToSmiles(m, doIsomericSmiles));
227 return size() - 1;
228 }
229
230 //! Add a smiles to the dataset, no validation is done
231 //! to the inputs.
232 unsigned int addSmiles(const std::string &smiles) {
233 mols.push_back(smiles);
234 return size() - 1;
235 }
236
237 boost::shared_ptr<ROMol> getMol(unsigned int idx) const override {
238 if (idx >= mols.size()) {
239 throw IndexErrorException(idx);
240 }
241
242 RWMol *m = SmilesToMol(mols[idx], 0, false);
243 if (m) {
244 m->updatePropertyCache();
245 }
246 return boost::shared_ptr<ROMol>(m);
247 }
248
249 unsigned int size() const override {
250 return rdcast<unsigned int>(mols.size());
251 }
252
253 std::vector<std::string> &getMols() { return mols; }
254 const std::vector<std::string> &getMols() const { return mols; }
255};
256
257//! Base FPI for the fingerprinter used to rule out impossible matches
259 std::vector<ExplicitBitVect *> fps;
260
261 public:
262 virtual ~FPHolderBase() {
263 for (size_t i = 0; i < fps.size(); ++i) {
264 delete fps[i];
265 }
266 }
267
268 virtual unsigned int size() const { return rdcast<unsigned int>(fps.size()); }
269
270 //! Adds a molecule to the fingerprinter
271 unsigned int addMol(const ROMol &m) {
272 fps.push_back(makeFingerprint(m));
273 return rdcast<unsigned int>(fps.size() - 1);
274 }
275
276 //! Adds a raw bit vector pointer to the fingerprinter, which takes ownership
277 //! PLEASE NOTE: make sure that the passed ExplicitBitVect
278 //! is compatible with the one generated by makeFingerprint()
280 fps.push_back(v);
281 return rdcast<unsigned int>(fps.size() - 1);
282 }
283
284 //! Adds a raw bit vector to the fingerprinter
285 //! PLEASE NOTE: make sure that the passed ExplicitBitVect
286 //! is compatible with the one generated by makeFingerprint()
287 unsigned int addFingerprint(const ExplicitBitVect &v) {
288 return addFingerprint(new ExplicitBitVect(v));
289 }
290
291 //! Return false if a substructure search can never match the molecule
292 bool passesFilter(unsigned int idx, const ExplicitBitVect &query) const {
293 if (idx >= fps.size()) {
294 throw IndexErrorException(idx);
295 }
296
297 return AllProbeBitsMatch(query, *fps[idx]);
298 }
299
300 //! Get the bit vector at the specified index (throws IndexError if out of
301 //! range)
302 const ExplicitBitVect &getFingerprint(unsigned int idx) const {
303 if (idx >= fps.size()) {
304 throw IndexErrorException(idx);
305 }
306 return *fps[idx];
307 }
308
309 //! make the query vector
310 //! Caller owns the vector!
311 virtual ExplicitBitVect *makeFingerprint(const ROMol &m) const = 0;
312
313 std::vector<ExplicitBitVect *> &getFingerprints() { return fps; }
314 const std::vector<ExplicitBitVect *> &getFingerprints() const { return fps; }
315};
316
317//! Uses the pattern fingerprinter with a user-defined number of bits (default:
318//! 2048) to rule out matches
320 unsigned int numBits;
321
322 public:
323 PatternHolder() : FPHolderBase(), numBits(defaultNumBits()) {}
324 PatternHolder(unsigned int numBits) : FPHolderBase(), numBits(numBits) {}
325 //! Caller owns the vector!
326 ExplicitBitVect *makeFingerprint(const ROMol &m) const override {
327 return PatternFingerprintMol(m, numBits);
328 }
329 const unsigned int &getNumBits() const { return numBits; };
330 unsigned int &getNumBits() { return numBits; };
//! Fingerprint size used when none is given (2048).
static unsigned int defaultNumBits() {
  constexpr unsigned int kDefaultNumBits = 2048;
  return kDefaultNumBits;
}
335};
336
338 : public PatternHolder {
339 public:
341 TautomerPatternHolder(unsigned int numBits) : PatternHolder(numBits) {}
342 ExplicitBitVect *makeFingerprint(const ROMol &m) const override {
343 std::vector<unsigned int> *atomCounts = nullptr;
344 ExplicitBitVect *setOnlyBits = nullptr;
345 const bool tautomericFingerprint = true;
346 return PatternFingerprintMol(m, getNumBits(), atomCounts, setOnlyBits,
347 tautomericFingerprint);
348 }
349};
350
352 public:
353 virtual ~KeyHolderBase() {}
354
355 //! Add a key to the database getting it from the molecule
356 virtual unsigned int addMol(const ROMol &m) = 0;
357
358 //! Add a key to the database, this needs to be in the same order
359 //! as the molecule, no validation is done
360 virtual unsigned int addKey(const std::string &) = 0;
361
362 //! Get the key at the requested index
363 // implementations should throw IndexError on out of range
364 virtual const std::string &getKey(unsigned int) const = 0;
365
366 //! Get the keys for a list of indices
367 virtual std::vector<std::string> getKeys(
368 const std::vector<unsigned int> &indices) const = 0;
369 //! Get the current key holder size
370 virtual unsigned int size() const = 0;
371};
372
374 std::string propname;
375 std::vector<std::string> keys;
376 const std::string empty_string = {};
377
378 public:
379 KeyFromPropHolder(const std::string &propname = "_Name")
380 : propname(propname) {}
381
382 std::string &getPropName() { return propname; }
383 const std::string &getPropName() const { return propname; }
384
385 std::vector<std::string> &getKeys() { return keys; }
386 const std::vector<std::string> &getKeys() const { return keys; }
387
388 unsigned int addMol(const ROMol &m) override {
389 std::string key;
390 if (m.getPropIfPresent(propname, key)) {
391 keys.push_back(std::move(key));
392 } else {
393 // XXX is this a warning? it could be verbose. Should we push back the
394 // string repr of the
395 // numeric index?
396 const static std::string prefix("LIBIDX-");
397 keys.emplace_back(prefix + boost::lexical_cast<std::string>(keys.size()));
398 }
399 return keys.size() - 1u;
400 };
401
402 unsigned int addKey(const std::string &key) override {
403 keys.push_back(key);
404 return keys.size() - 1u;
405 }
406
407 const std::string &getKey(unsigned int idx) const override {
408 if (idx >= keys.size()) {
409 throw IndexErrorException(idx);
410 }
411 return keys[idx];
412 }
413
414 std::vector<std::string> getKeys(
415 const std::vector<unsigned int> &indices) const override {
416 std::vector<std::string> res;
417 std::transform(indices.begin(), indices.end(), std::back_inserter(res),
418 [=](unsigned idx) { return keys.at(idx); });
419 return res;
420 }
421 unsigned int size() const override { return keys.size(); }
422};
423
424//! Substructure Search a library of molecules
425/*! This class allows for multithreaded substructure searches of
426 large datasets.
427
428 The implementations can use fingerprints to speed up searches
429 and have molecules cached as binary forms to reduce memory
430 usage.
431
432 basic usage:
433 \code
434 SubstructLibrary lib;
435 lib.addMol(mol);
436 std::vector<unsigned int> results = lib.getMatches(query);
437 for(std::vector<unsigned int>::const_iterator matchIndex=results.begin();
438 matchIndex != results.end();
439 ++matchIndex) {
440 boost::shared_ptr<ROMol> match = lib.getMol(*matchIndex);
441 }
442 \endcode
443
444 Using different mol holders and pattern fingerprints.
445
446 \code
447 boost::shared_ptr<CachedTrustedSmilesMolHolder> molHolder = \
448 boost::make_shared<CachedTrustedSmilesMolHolder>();
449 boost::shared_ptr<PatternHolder> patternHolder = \
450 boost::make_shared<PatternHolder>();
451
452 SubstructLibrary lib(molHolder, patternHolder);
453 lib.addMol(mol);
454 \endcode
455
456 Cached molecule holders create molecules on demand. There are currently
457 three styles of cached molecules.
458
459 CachedMolHolder: stores molecules in the rdkit binary format.
460 CachedSmilesMolHolder: stores molecules in smiles format.
461 CachedTrustedSmilesMolHolder: stores molecules in smiles format.
462
463 The CachedTrustedSmilesMolHolder is made to add molecules from
464 a trusted source. This makes the basic assumption that RDKit was
465 used to sanitize and canonicalize the smiles string. In practice
466 this is considerably faster than using arbitrary smiles strings since
467 certain assumptions can be made. Molecules generated from trusted
468 smiles do not have ring information (although this is created
469 in the molecule being searched if necessary).
470
471 When loading from external data, as opposed to using the "addMol" API,
472 care must be taken to ensure that the pattern fingerprints and smiles
473 are synchronized.
474
475 Each pattern holder has an API point for making its fingerprint. This
476 is useful to ensure that the pattern stored in the database will be
477 compatible with the patterns made when analyzing queries.
478
479 \code
480 boost::shared_ptr<CachedTrustedSmilesMolHolder> molHolder = \
481 boost::make_shared<CachedTrustedSmilesMolHolder>();
482 boost::shared_ptr<PatternHolder> patternHolder = \
483 boost::make_shared<PatternHolder>();
484
485 // the PatternHolder instance is able to make fingerprints.
486 // These, of course, can be read from a file. For demonstration
487 // purposes we construct them here.
488 const std::string trustedSmiles = "c1ccccc1";
489 ROMol *m = SmilesToMol(trustedSmiles);
490 const ExplicitBitVect *bitVector = patternHolder->makeFingerprint(*m);
491
492 // The trusted smiles and bitVector can be read from any source.
493 // This is the fastest way to load a substruct library.
494 molHolder->addSmiles( trustedSmiles );
495 patternHolder->addFingerprint( *bitVector );
496 SubstructLibrary lib(molHolder, patternHolder);
497 delete m;
498 delete bitVector;
499 \endcode
500
501 Finally, using the KeyFromPropHolder will store user ids or keys.
502 By default, it uses RDKit's default _Name prop, but can be changed
503 to any property.
504
505 \code
506 boost::shared_ptr<CachedTrustedSmilesMolHolder> molHolder = \
507 boost::make_shared<CachedTrustedSmilesMolHolder>();
508 boost::shared_ptr<KeyFromPropHolder> keyHolder = \
509 boost::make_shared<KeyFromPropHolder>();
510 SubstructLibrary lib(molHolder, keyHolder);
511 ...
512
513 // Keys can be fetched one at a time or in bulk through the key holder:
514 auto key = lib.getKeys().getKey(idx);
515 auto keys = lib.getKeys().getKeys(lib.getMatches(query));
516 \endcode
517
518*/
520 boost::shared_ptr<MolHolderBase> molholder;
521 boost::shared_ptr<FPHolderBase> fpholder;
522 boost::shared_ptr<KeyHolderBase> keyholder;
523
524 MolHolderBase *mols; // used for a small optimization
525 FPHolderBase *fps{nullptr};
526 bool is_tautomerquery = false;
527 std::vector<unsigned int> searchOrder;
528
529 public:
531 : molholder(new MolHolder),
532 fpholder(),
533 keyholder(),
534 mols(molholder.get()) {}
535
536 SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules)
537 : molholder(std::move(molecules)),
538 fpholder(),
539 keyholder(),
540 mols(molholder.get()),
541 fps(nullptr) {}
542
543 SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules,
544 boost::shared_ptr<FPHolderBase> fingerprints)
545 : molholder(std::move(molecules)),
546 fpholder(std::move(fingerprints)),
547 keyholder(),
548 mols(molholder.get()),
549 fps(fpholder.get()) {
550 if (fpholder.get() &&
551 dynamic_cast<TautomerPatternHolder *>(fpholder.get()) != nullptr) {
552 is_tautomerquery = true;
553 }
554 }
555
556 SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules,
557 boost::shared_ptr<KeyHolderBase> keys)
558 : molholder(std::move(molecules)),
559 fpholder(),
560 keyholder(std::move(keys)),
561 mols(molholder.get()),
562 fps(nullptr) {
563 if (fpholder.get() &&
564 dynamic_cast<TautomerPatternHolder *>(fpholder.get()) != nullptr) {
565 is_tautomerquery = true;
566 }
567 }
568
569 SubstructLibrary(boost::shared_ptr<MolHolderBase> molecules,
570 boost::shared_ptr<FPHolderBase> fingerprints,
571 boost::shared_ptr<KeyHolderBase> keys)
572 : molholder(std::move(molecules)),
573 fpholder(std::move(fingerprints)),
574 keyholder(std::move(keys)),
575 mols(molholder.get()),
576 fps(fpholder.get()) {
577 if (fpholder.get() &&
578 dynamic_cast<TautomerPatternHolder *>(fpholder.get()) != nullptr) {
579 is_tautomerquery = true;
580 }
581 }
582
583 SubstructLibrary(const std::string &pickle)
584 : molholder(new MolHolder),
585 fpholder(),
586 mols(molholder.get()),
587 fps(nullptr) {
588 initFromString(pickle);
589 if (fpholder.get() &&
590 dynamic_cast<TautomerPatternHolder *>(fpholder.get()) != nullptr) {
591 is_tautomerquery = true;
592 }
593 }
594
595 //! Get the underlying molecule holder implementation
596 boost::shared_ptr<MolHolderBase> &getMolHolder() { return molholder; }
597
598 const boost::shared_ptr<MolHolderBase> &getMolHolder() const {
599 return molholder;
600 }
601
602 //! Get the underlying molecule holder implementation
603 boost::shared_ptr<FPHolderBase> &getFpHolder() { return fpholder; }
604
605 //! Get the underlying molecule holder implementation
606 const boost::shared_ptr<FPHolderBase> &getFpHolder() const {
607 return fpholder;
608 }
609
610 //! Get the underlying molecule holder implementation
611 boost::shared_ptr<KeyHolderBase> &getKeyHolder() { return keyholder; }
612
613 //! Get the underlying molecule holder implementation
614 const boost::shared_ptr<KeyHolderBase> &getKeyHolder() const {
615 return keyholder;
616 }
617
619 PRECONDITION(mols, "Molecule holder NULL in SubstructLibrary");
620 return *mols;
621 }
622
623 //! Get the underlying fingerprint implementation.
624 /*! Throws a value error if no fingerprints have been set */
626 if (!fps) {
627 throw ValueErrorException("Substruct Library does not have fingerprints");
628 }
629 return *fps;
630 }
631
633 if (!fps) {
634 throw ValueErrorException("Substruct Library does not have fingerprints");
635 }
636 return *fps;
637 }
638
639 //! Get the underlying key holder implementation.
640 /*! Throws a value error if no key holder has been set */
642 if (!keyholder.get()) {
643 throw ValueErrorException("Substruct Library does not have fingerprints");
644 }
645 return *keyholder.get();
646 }
647
648 //! Get the underlying key holder implementation.
649 /*! Throws a value error if no keyholder have been set */
650 const KeyHolderBase &getKeys() const {
651 if (!keyholder.get()) {
652 throw ValueErrorException("Substruct Library does not have fingerprints");
653 }
654 return *keyholder.get();
655 }
656
657 //! Add a molecule to the library
658 /*!
659 \param mol Molecule to add
660
661 returns index for the molecule in the library
662 */
663 unsigned int addMol(const ROMol &mol);
664
665 //! Get the matching indices for the query
666 /*!
667 \param query Query or Tautomer Query to match against molecules
668 \param recursionPossible flags whether or not recursive matches are allowed
669 [default true]
670 \param useChirality use atomic CIP codes as part of the comparison
671 [default true]
672 \param useQueryQueryMatches if set, the contents of atom and bond queries
673 will be used as part of the matching
674 [default false]
675 \param numThreads If -1 use all available processors [default -1]
676 \param maxResults Maximum results to return, -1 means return all
677 [default -1]
678 */
679 template <class Query>
680 std::vector<unsigned int> getMatches(const Query &query,
681 bool recursionPossible = true,
682 bool useChirality = true,
683 bool useQueryQueryMatches = false,
684 int numThreads = -1,
685 int maxResults = -1) const {
687 params.recursionPossible = recursionPossible;
688 params.useChirality = useChirality;
689 params.useQueryQueryMatches = useQueryQueryMatches;
690 return getMatches(query, 0, size(), params, numThreads, maxResults);
691 }
692 //! overload
693 template <class Query>
694 std::vector<unsigned int> getMatches(const Query &query,
695 const SubstructMatchParameters &params,
696 int numThreads = -1,
697 int maxResults = -1) const {
698 return getMatches(query, 0, size(), params, numThreads, maxResults);
699 }
700 //! Get the matching indices for the query between the given indices
701 /*!
702 \param query Query to match against molecules
703 \param startIdx Start index of the search
704 \param endIdx Ending idx (non-inclusive) of the search.
705 \param recursionPossible flags whether or not recursive matches are allowed
706 [default true]
707 \param useChirality use atomic CIP codes as part of the comparison
708 [default true]
709 \param useQueryQueryMatches if set, the contents of atom and bond queries
710 will be used as part of the matching
711 [default false]
712 \param numThreads If -1 use all available processors [default -1]
713 \param maxResults Maximum results to return, -1 means return all
714 [default -1]
715 */
716 template <class Query>
717 std::vector<unsigned int> getMatches(
718 const Query &query, unsigned int startIdx, unsigned int endIdx,
719 bool recursionPossible = true, bool useChirality = true,
720 bool useQueryQueryMatches = false, int numThreads = -1,
721 int maxResults = -1) const {
723 params.recursionPossible = recursionPossible;
724 params.useChirality = useChirality;
725 params.useQueryQueryMatches = useQueryQueryMatches;
726 return getMatches(query, startIdx, endIdx, params, numThreads, maxResults);
727 };
728 //! overload
729 std::vector<unsigned int> getMatches(const ROMol &query,
730 unsigned int startIdx,
731 unsigned int endIdx,
732 const SubstructMatchParameters &params,
733 int numThreads = -1,
734 int maxResults = -1) const;
735 //! overload
736 std::vector<unsigned int> getMatches(const MolBundle &query,
737 unsigned int startIdx,
738 unsigned int endIdx,
739 const SubstructMatchParameters &params,
740 int numThreads = -1,
741 int maxResults = -1) const;
742 //! overload
743 std::vector<unsigned int> getMatches(const TautomerQuery &query,
744 unsigned int startIdx,
745 unsigned int endIdx,
746 const SubstructMatchParameters &params,
747 int numThreads = -1,
748 int maxResults = -1) const;
749 //! overload
750 std::vector<unsigned int> getMatches(const ExtendedQueryMol &query,
751 unsigned int startIdx,
752 unsigned int endIdx,
753 const SubstructMatchParameters &params,
754 int numThreads = -1,
755 int maxResults = -1) const;
756
757 //! Return the number of matches for the query
758 /*!
759 \param query Molecule or Tautomer Query to match against molecules
760 \param recursionPossible flags whether or not recursive matches are allowed
761 [default true]
762 \param useChirality use atomic CIP codes as part of the comparison
763 [default true]
764 \param useQueryQueryMatches if set, the contents of atom and bond queries
765 will be used as part of the matching
766 [default false]
767 \param numThreads If -1 use all available processors [default -1]
768 */
769 template <class Query>
770 unsigned int countMatches(const Query &query, bool recursionPossible = true,
771 bool useChirality = true,
772 bool useQueryQueryMatches = false,
773 int numThreads = -1) const {
775 params.recursionPossible = recursionPossible;
776 params.useChirality = useChirality;
777 params.useQueryQueryMatches = useQueryQueryMatches;
778 return countMatches(query, 0, size(), params, numThreads);
779 }
780 //! overload
781 template <class Query>
782 unsigned int countMatches(const Query &query,
783 const SubstructMatchParameters &params,
784 int numThreads = -1) const {
785 return countMatches(query, 0, size(), params, numThreads);
786 }
787
788 //! Return the number of matches for the query
789
790 //! Return the number of matches for the query between the given indices
791 /*!
792 \param query Query to match against molecules
793 \param startIdx Start index of the search
794 \param endIdx Ending idx (non-inclusive) of the search.
795 \param recursionPossible flags whether or not recursive matches are allowed
796 [default true]
797 \param useChirality use atomic CIP codes as part of the comparison
798 [default true]
799 \param useQueryQueryMatches if set, the contents of atom and bond queries
800 will be used as part of the matching
801 [default false]
802 \param numThreads If -1 use all available processors [default -1]
803 */
804 template <class Query>
805 unsigned int countMatches(const Query &query, unsigned int startIdx,
806 unsigned int endIdx, bool recursionPossible = true,
807 bool useChirality = true,
808 bool useQueryQueryMatches = false,
809 int numThreads = -1) const {
811 params.recursionPossible = recursionPossible;
812 params.useChirality = useChirality;
813 params.useQueryQueryMatches = useQueryQueryMatches;
814 return countMatches(query, startIdx, endIdx, params, numThreads);
815 };
816
817 //! overload
818 unsigned int countMatches(const ROMol &query, unsigned int startIdx,
819 unsigned int endIdx,
820 const SubstructMatchParameters &params,
821 int numThreads = -1) const;
822 //! overload
823 unsigned int countMatches(const TautomerQuery &query, unsigned int startIdx,
824 unsigned int endIdx,
825 const SubstructMatchParameters &params,
826 int numThreads = -1) const;
827 //! overload
828 unsigned int countMatches(const MolBundle &query, unsigned int startIdx,
829 unsigned int endIdx,
830 const SubstructMatchParameters &params,
831 int numThreads = -1) const;
832 //! overload
833 unsigned int countMatches(const ExtendedQueryMol &query,
834 unsigned int startIdx, unsigned int endIdx,
835 const SubstructMatchParameters &params,
836 int numThreads = -1) const;
837
838 //! Returns true if any match exists for the query
839 /*!
840 \param query Molecule or Tautomer Query to match against molecules
841 \param recursionPossible flags whether or not recursive matches are allowed
842 [default true]
843 \param useChirality use atomic CIP codes as part of the comparison
844 [default true]
845 \param useQueryQueryMatches if set, the contents of atom and bond queries
846 will be used as part of the matching
847 [default false]
848 \param numThreads If -1 use all available processors [default -1]
849 */
850 template <class Query>
851 bool hasMatch(const Query &query, bool recursionPossible = true,
852 bool useChirality = true, bool useQueryQueryMatches = false,
853 int numThreads = -1) const {
855 params.recursionPossible = recursionPossible;
856 params.useChirality = useChirality;
857 params.useQueryQueryMatches = useQueryQueryMatches;
858 return hasMatch(query, 0, size(), params, numThreads);
859 }
860 //! overload
861 template <class Query>
862 bool hasMatch(const Query &query, const SubstructMatchParameters &params,
863 int numThreads = -1) const {
864 return hasMatch(query, 0, size(), params, numThreads);
865 }
866 //! Returns true if any match exists for the query between the specified
867 //! indices
868 /*!
869 \param query Query to match against molecules
870 \param startIdx Start index of the search
871 \param endIdx Ending idx (non-inclusive) of the search.
872 \param recursionPossible flags whether or not recursive matches are allowed
873 [default true]
874 \param useChirality use atomic CIP codes as part of the comparison [default true]
875 \param useQueryQueryMatches if set, the contents of atom and bond queries
876 will be used as part of the matching [default false]
877 \param numThreads If -1 use all available processors [default -1]
878 */
879 template <class Query>
880 bool hasMatch(const Query &query, unsigned int startIdx, unsigned int endIdx,
881 bool recursionPossible = true, bool useChirality = true,
882 bool useQueryQueryMatches = false, int numThreads = -1) const {
884 params.recursionPossible = recursionPossible;
885 params.useChirality = useChirality;
886 params.useQueryQueryMatches = useQueryQueryMatches;
887 return hasMatch(query, startIdx, endIdx, params, numThreads);
888 };
889 //! overload
890 bool hasMatch(const ROMol &query, unsigned int startIdx, unsigned int endIdx,
891 const SubstructMatchParameters &params,
892 int numThreads = -1) const;
893 //! overload
894 bool hasMatch(const TautomerQuery &query, unsigned int startIdx,
895 unsigned int endIdx, const SubstructMatchParameters &params,
896 int numThreads = -1) const;
897 //! overload
898 bool hasMatch(const MolBundle &query, unsigned int startIdx,
899 unsigned int endIdx, const SubstructMatchParameters &params,
900 int numThreads = -1) const;
901 //! overload
902 bool hasMatch(const ExtendedQueryMol &query, unsigned int startIdx,
903 unsigned int endIdx, const SubstructMatchParameters &params,
904 int numThreads = -1) const;
905 //! Returns the molecule at the given index
906 /*!
907 \param idx Index of the molecule in the library (n.b. could contain
908 null)
909 */
910 boost::shared_ptr<ROMol> getMol(unsigned int idx) const {
911 // expects implementation to throw IndexError if out of range
912 PRECONDITION(mols, "molholder is null in SubstructLibrary");
913 return mols->getMol(idx);
914 }
915
916 //! Returns the molecule at the given index
917 /*!
918 \param idx Index of the molecule in the library (n.b. could contain
919 null)
920 */
921 boost::shared_ptr<ROMol> operator[](unsigned int idx) {
922 // expects implementation to throw IndexError if out of range
923 PRECONDITION(mols, "molholder is null in SubstructLibrary");
924 return mols->getMol(idx);
925 }
926
927 //! return the number of molecules in the library
928 unsigned int size() const {
929 PRECONDITION(mols, "molholder is null in SubstructLibrary");
930 return rdcast<unsigned int>(molholder->size());
931 }
932
933 //! does error checking
934 void setSearchOrder(const std::vector<unsigned int> &order) {
935 for (const auto idx : order) {
936 if (idx >= mols->size()) {
937 throw IndexErrorException(idx);
938 }
939 }
940 searchOrder = order;
941 }
942
943 const std::vector<unsigned int> &getSearchOrder() const {
944 return searchOrder;
945 }
946
947 std::vector<unsigned int> &getSearchOrder() { return searchOrder; }
948 //! access required for serialization
950 is_tautomerquery = false;
951 mols = molholder.get();
952 fps = fpholder.get();
953 if (fps && dynamic_cast<TautomerPatternHolder *>(fps) != nullptr) {
954 is_tautomerquery = true;
955 }
956 }
957
958 //! serializes (pickles) to a stream
959 void toStream(std::ostream &ss) const;
960 //! returns a string with a serialized (pickled) representation
961 std::string Serialize() const;
962 //! initializes from a stream pickle
963 void initFromStream(std::istream &ss);
964 //! initializes from a string pickle
965 void initFromString(const std::string &text);
966};
967} // namespace RDKit
968
970#endif
Contains general bit-comparison and similarity operations.
RDKIT_DATASTRUCTS_EXPORT bool AllProbeBitsMatch(const char *probe, const char *ref)
#define PRECONDITION(expr, mess)
Definition Invariant.h:109
Defines a class for managing bundles of molecules.
pulls in the core RDKit functionality
a class for bit vectors that are densely occupied
Class to allow us to throw an IndexError from C++ and have it make it back to Python.
Definition Exceptions.h:20
Concrete class that holds binary cached molecules in memory.
std::vector< std::string > & getMols()
unsigned int size() const override
Get the current library size.
unsigned int addMol(const ROMol &m) override
boost::shared_ptr< ROMol > getMol(unsigned int idx) const override
const std::vector< std::string > & getMols() const
unsigned int addBinary(const std::string &pickle)
Concrete class that holds smiles strings in memory.
std::vector< std::string > & getMols()
unsigned int addSmiles(const std::string &smiles)
const std::vector< std::string > & getMols() const
unsigned int addMol(const ROMol &m) override
boost::shared_ptr< ROMol > getMol(unsigned int idx) const override
unsigned int size() const override
Get the current library size.
Concrete class that holds trusted smiles strings in memory.
std::vector< std::string > & getMols()
unsigned int addSmiles(const std::string &smiles)
unsigned int addMol(const ROMol &m) override
const std::vector< std::string > & getMols() const
boost::shared_ptr< ROMol > getMol(unsigned int idx) const override
unsigned int size() const override
Get the current library size.
Base FPI for the fingerprinter used to rule out impossible matches.
std::vector< ExplicitBitVect * > & getFingerprints()
unsigned int addMol(const ROMol &m)
Adds a molecule to the fingerprinter.
virtual unsigned int size() const
const std::vector< ExplicitBitVect * > & getFingerprints() const
bool passesFilter(unsigned int idx, const ExplicitBitVect &query) const
Return false if a substructure search can never match the molecule.
unsigned int addFingerprint(ExplicitBitVect *v)
const ExplicitBitVect & getFingerprint(unsigned int idx) const
unsigned int addFingerprint(const ExplicitBitVect &v)
virtual ExplicitBitVect * makeFingerprint(const ROMol &m) const =0
KeyFromPropHolder(const std::string &propname="_Name")
const std::string & getKey(unsigned int idx) const override
unsigned int addKey(const std::string &key) override
unsigned int size() const override
Get the current keyholder size.
const std::vector< std::string > & getKeys() const
std::vector< std::string > & getKeys()
std::vector< std::string > getKeys(const std::vector< unsigned int > &indices) const override
const std::string & getPropName() const
unsigned int addMol(const ROMol &m) override
Add a key to the database getting it from the molecule.
virtual std::vector< std::string > getKeys(const std::vector< unsigned int > &indices) const =0
virtual const std::string & getKey(unsigned int) const =0
virtual unsigned int addMol(const ROMol &m)=0
Add a key to the database getting it from the molecule.
virtual unsigned int size() const =0
Get the current keyholder size.
virtual unsigned int addKey(const std::string &)=0
MolBundle contains a collection of related ROMols.
Definition MolBundle.h:59
Base class API for holding molecules to substructure search.
virtual unsigned int addMol(const ROMol &m)=0
virtual unsigned int size() const =0
Get the current library size.
virtual boost::shared_ptr< ROMol > getMol(unsigned int) const =0
Concrete class that holds molecules in memory.
unsigned int addMol(const ROMol &m) override
const std::vector< boost::shared_ptr< ROMol > > & getMols() const
unsigned int size() const override
Get the current library size.
boost::shared_ptr< ROMol > getMol(unsigned int idx) const override
std::vector< boost::shared_ptr< ROMol > > & getMols()
PatternHolder(unsigned int numBits)
const unsigned int & getNumBits() const
unsigned int & getNumBits()
static unsigned int defaultNumBits()
ExplicitBitVect * makeFingerprint(const ROMol &m) const override
Caller owns the vector!
RWMol is a molecule class that is intended to be edited.
Definition RWMol.h:32
Substructure Search a library of molecules.
unsigned int countMatches(const Query &query, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1) const
Return the number of matches for the query.
unsigned int countMatches(const ExtendedQueryMol &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
unsigned int addMol(const ROMol &mol)
Add a molecule to the library.
std::vector< unsigned int > getMatches(const ROMol &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1, int maxResults=-1) const
overload
void initFromStream(std::istream &ss)
initializes from a stream pickle
KeyHolderBase & getKeys()
Get the underlying key holder implementation.
std::vector< unsigned int > getMatches(const ExtendedQueryMol &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1, int maxResults=-1) const
overload
boost::shared_ptr< ROMol > getMol(unsigned int idx) const
Returns the molecule at the given index.
boost::shared_ptr< MolHolderBase > & getMolHolder()
Get the underlying molecule holder implementation.
bool hasMatch(const Query &query, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1) const
Returns true if any match exists for the query.
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules, boost::shared_ptr< FPHolderBase > fingerprints, boost::shared_ptr< KeyHolderBase > keys)
unsigned int countMatches(const Query &query, unsigned int startIdx, unsigned int endIdx, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1) const
Return the number of matches for the query.
std::vector< unsigned int > getMatches(const MolBundle &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1, int maxResults=-1) const
overload
std::vector< unsigned int > getMatches(const Query &query, const SubstructMatchParameters &params, int numThreads=-1, int maxResults=-1) const
overload
boost::shared_ptr< FPHolderBase > & getFpHolder()
Get the underlying fingerprint holder implementation.
std::vector< unsigned int > getMatches(const Query &query, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1, int maxResults=-1) const
Get the matching indices for the query.
const MolHolderBase & getMolecules() const
void initFromString(const std::string &text)
initializes from a string pickle
unsigned int countMatches(const Query &query, const SubstructMatchParameters &params, int numThreads=-1) const
overload
const KeyHolderBase & getKeys() const
Get the underlying key holder implementation.
const boost::shared_ptr< KeyHolderBase > & getKeyHolder() const
Get the underlying key holder implementation.
bool hasMatch(const ExtendedQueryMol &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
bool hasMatch(const ROMol &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
unsigned int countMatches(const MolBundle &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
FPHolderBase & getFingerprints()
Get the underlying fingerprint implementation.
void setSearchOrder(const std::vector< unsigned int > &order)
does error checking
bool hasMatch(const Query &query, unsigned int startIdx, unsigned int endIdx, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1) const
const FPHolderBase & getFingerprints() const
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules, boost::shared_ptr< KeyHolderBase > keys)
bool hasMatch(const MolBundle &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
unsigned int countMatches(const ROMol &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
bool hasMatch(const Query &query, const SubstructMatchParameters &params, int numThreads=-1) const
overload
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules, boost::shared_ptr< FPHolderBase > fingerprints)
boost::shared_ptr< KeyHolderBase > & getKeyHolder()
Get the underlying key holder implementation.
bool hasMatch(const TautomerQuery &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
const std::vector< unsigned int > & getSearchOrder() const
void resetHolders()
access required for serialization
unsigned int size() const
return the number of molecules in the library
std::vector< unsigned int > getMatches(const Query &query, unsigned int startIdx, unsigned int endIdx, bool recursionPossible=true, bool useChirality=true, bool useQueryQueryMatches=false, int numThreads=-1, int maxResults=-1) const
Get the matching indices for the query between the given indices.
SubstructLibrary(boost::shared_ptr< MolHolderBase > molecules)
SubstructLibrary(const std::string &pickle)
std::string Serialize() const
returns a string with a serialized (pickled) representation
unsigned int countMatches(const TautomerQuery &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1) const
overload
std::vector< unsigned int > getMatches(const TautomerQuery &query, unsigned int startIdx, unsigned int endIdx, const SubstructMatchParameters &params, int numThreads=-1, int maxResults=-1) const
overload
void toStream(std::ostream &ss) const
serializes (pickles) to a stream
boost::shared_ptr< ROMol > operator[](unsigned int idx)
Returns the molecule at the given index.
const boost::shared_ptr< FPHolderBase > & getFpHolder() const
Get the underlying fingerprint holder implementation.
const boost::shared_ptr< MolHolderBase > & getMolHolder() const
std::vector< unsigned int > & getSearchOrder()
ExplicitBitVect * makeFingerprint(const ROMol &m) const override
Caller owns the vector!
TautomerPatternHolder(unsigned int numBits)
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition Exceptions.h:40
#define RDKIT_SUBSTRUCTLIBRARY_EXPORT
Definition export.h:529
RDKit::RWMol * SmilesToMol(const std::string &smi, const SmilesParserParams &ps)
Std stuff.
RDKIT_FINGERPRINTS_EXPORT ExplicitBitVect * PatternFingerprintMol(const ROMol &mol, unsigned int fpSize=2048, std::vector< unsigned int > *atomCounts=nullptr, ExplicitBitVect *setOnlyBits=nullptr, bool tautomericFingerprint=false)
Generates a topological fingerprint for a molecule using a series of pre-defined structural patterns.
RDKIT_SUBSTRUCTLIBRARY_EXPORT bool SubstructLibraryCanSerialize()
bool recursionPossible
Allow recursive queries.