RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
SynthonSpace.h
Go to the documentation of this file.
1//
2// Copyright (C) David Cosgrove 2024.
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10// This file and others here contain an implementation of
11// synthonspace substructure search similar to that described in
12// 'Fast Substructure Search in Combinatorial Library Spaces',
13// Thomas Liphardt and Thomas Sander,
14// J. Chem. Inf. Model. 2023, 63, 16, 5133–5141
15// https://doi.org/10.1021/acs.jcim.3c00290
16
17#ifndef RDKIT_SYNTHONSPACE_H
18#define RDKIT_SYNTHONSPACE_H
19
20/*! \file SynthonSpace.h
21
22 \brief contains a class for searching combinatorial libraries in
23 Synthon format such as Enamine REAL.
24
25 \b Note that this functionality is experimental and the API may change
26 in future releases.
27*/
28
29#include <map>
30#include <string>
31#include <vector>
32
33#include <boost/dynamic_bitset.hpp>
34
35#include <RDGeneral/export.h>
41
42namespace RDKit {
43class ROMol;
44
45namespace RascalMCES {
46struct RascalOptions;
47}
48
49namespace SynthonSpaceSearch {
50
51// This is the maximum number of connectors that we can deal with at the
52// moment. In reality, there may be fewer than this. However, the key limit
53// is in the symbols used for the connectors in Enamine REAL etc.
54const std::vector<std::string> CONNECTOR_SYMBOLS{"[U]", "[Np]", "[Pu]", "[Am]"};
55constexpr unsigned int MAX_CONNECTOR_NUM{4};
56
58 std::int64_t maxHits{1000}; // The maximum number of hits to return.
59 // Use -1 for no maximum.
60 std::uint64_t maxNumFragSets{
61 100000}; // The maximum number of fragment sets the query can
62 // be broken into. Big molecules will create huge
63 // numbers of fragment sets that may cause excessive
64 // memory use. If the number of fragment sets hits this
65 // number, fragmentation stops and the search results
66 // will likely be incomplete.
67 std::int64_t toTryChunkSize{2500000}; // For similarity searching, especially
68 // fingerprint similarity, there can be a
69 // very large number of possible hits to
70 // screen which can use a lot of memory and
71 // crash the program. It will also be very
72 // slow. To alleviate the memory use, the
73 // possible hits are processed in chunks.
74 // This parameter sets the chunk size.
75
76 std::int64_t hitStart{0}; // Sequence number of hit to start from. So that
77 // you can return the next N hits of a search
78 // having already obtained N-1.
79 bool randomSample{false}; // If true, returns a random sample of the hit
80 // list, up to maxHits in number.
81 int randomSeed{-1}; // Seed for random-number generator. -1 means use
82 // a random seed (std::random_device).
83 bool buildHits{true}; // If false, reports the maximum number of hits that
84 // the search could produce, but doesn't return them.
85 int numRandomSweeps{10}; // The random sampling doesn't always produce the
86 // required number of hits in 1 go. This parameter
87 // controls how many loops it makes to try and get
88 // the hits before giving up.
89 double similarityCutoff{0.5}; // Similarity cutoff for returning hits by
90 // fingerprint similarity. The default is
91 // appropriate for a Morgan fingerprint of
92 // radius=2, it may need changing for other
93 // fingerprint types.
94 double fragSimilarityAdjuster{
95 0.1}; // Similarity values for fragments are generally low
96 // due to low bit densities. For the fragment
97 // matching, reduce the similarity cutoff
98 // by this amount. A higher number will give slower search
99 // times, a lower number will give faster searches at the
100 // risk of missing some hits. The value you use should have
101 // a positive correlation with your FOMO.
102 double approxSimilarityAdjuster{
103 0.1}; // The fingerprint search uses an approximate similarity method
104 // before building a product and doing a final check. The
105 // similarityCutoff is reduced by this value for the approximate
106 // check. A lower value will give faster run times at the
107 // risk of missing some hits. The value you use should have a
108 // positive correlation with your FOMO. The default is
109 // appropriate for Morgan fingerprints. With RDKit fingerprints,
110 // 0.05 is adequate, and higher than that has been seen to
111 // produce long run times.
112 std::uint64_t timeOut{600}; // Maximum number of seconds to spend on a single
113 // search. 0 means no maximum.
114 int numThreads = 1; // The number of threads to use. If > 0, will use that
115 // number. If <= 0, will use the number of hardware
116 // threads plus this number. So if the number of
117 // hardware threads is 8, and numThreads is -1, it will
118 // use 7 threads.
119};
120
121class Synthon;
122
124 friend class SynthonSet;
128
129 public:
130 explicit SynthonSpace() = default;
131 ~SynthonSpace() = default;
132 SynthonSpace(const SynthonSpace &other) = delete;
133 SynthonSpace &operator=(const SynthonSpace &other) = delete;
134 /*!
135 * Get the number of different reactions in the SynthonSpace.
136 *
137 * @return size_t
138 */
139 size_t getNumReactions() const;
140 /*!
141 * Get a list of the names of all the reactions in the SynthonSpace.
142 *
143 * @return the names of all the reactions
144 */
145 std::vector<std::string> getReactionNames() const;
146 const std::shared_ptr<SynthonSet> getReaction(std::string reactionName);
147 // The Synthons have a PatternFingerprint for screening in substructure
148 // searches. It's important that the screening process creates ones
149 // of the same size, so this finds out what size that is.
150 unsigned int getPatternFPSize() const;
151 // Likewise for the fingerprints used for similarity searching
152 unsigned int getFPSize() const;
153
154 std::string getInputFileName() const;
155
156 /*!
157 * Get the total number of products that the SynthonSpace could produce.
158 *
159 * @return std::uint64_t
160 */
161 std::uint64_t getNumProducts() const;
162
163 /*!
164 * Get the info string for the fingerprint generator used to
165 * generate the stored fingerprints, so the user can query with
166 * the same type.
167 *
168 * @return
169 */
170 std::string getSynthonFingerprintType() const { return d_fpType; }
171
172 /*!
173 * Perform a substructure search with the given query molecule across
174 * the synthonspace library. Duplicate SMILES strings produced by
175 * different reactions will be returned.
176 *
177 * @param query : query molecule
178 * @param params : (optional) settings for the search
179 * @return : the hits as a SearchResults object.
180 */
182 const ROMol &query,
185
186 /*!
187 * Perform a substructure search with the given generalized query
188 * molecule across the synthonspace library. Duplicate SMILES strings
189 * produced by different reactions will be returned.
190 *
191 * @param query : query molecule
192 * @param params : (optional) settings for the search
193 * @return : the hits as a SearchResults object.
194 */
199
200 /*!
201 * Perform a fingerprint similarity search with the given query molecule
202 * across the synthonspace library. Duplicate SMILES strings produced by
203 * different reactions will be returned.
204 * @param query : query molecule
205 * @param fpGen: a FingerprintGenerator object that will provide the
206 * fingerprints for the similarity calculation
207 * @param params : (optional) settings for the search
208 * @return : the hits as a SearchResults object.
209 */
211 const ROMol &query, const FingerprintGenerator<std::uint64_t> &fpGen,
213
214 /*!
215 * Perform a RASCAL similarity search with the given query molecule
216 * across the synthonspace library. Duplicate SMILES strings produced by
217 * different reactions will be returned.
218 *
219 * @param query : query molecule
220 * @param rascalOptions: RASCAL options. The similarityThreshold value
221 * in the rascalOptions will be used rather than
222 * params.similarityCutoff,
223 * but params.fragSimilarityAdjuster will be used
224 * to adjust the threshold for the fragment
225 * comparisons.
226 * @param params : (optional) settings for the search
227 * @return : the hits as a SearchResults object.
228 */
230 const ROMol &query, const RascalMCES::RascalOptions &rascalOptions,
232
233 /*!
234 *
235 * @param inFilename: name of the file containing the synthon-based library.
236 *
237 * The original format is:
238 * all lines are tab-separated
239 * first line:SMILES synton_id synton# reaction_id
240 * Note the spelling "synton" from the original paper/example file.
241 * Subsequent lines have a single reagent e.g.
242 * OCC([U])=NN=[Np] 1-1 0 triazole-1
243 * C1CCCC1N([Pu])[U] 2-1 1 triazole-1
244 * CC1CCN(C1)C(=[Np])[Pu] 3-1 2 triazole-1
245 *
246 * Other acceptable formats are as above, but with a 5th column "release":
247 * SMILES synton_id synton# reaction_id release
248 *
249 * or a comma-separated equivalent of the first format:
250 * SMILES,synton_id,synton_role,reaction_id
251 * where the 3rd column is named differently but has the same meaning.
252 * The formatting of the first 2 formats has been relaxed such that any
253 * whitespace may be used as the field separator.
254 *
255 * Attachment points are U, Np, Pu and Am for up to 4 synthons per reaction.
256 * A product is created by taking a synthon from each synton# value and
257 * combining them by removing matching trans-uranic elements and replacing
258 * them with a direct bond of the appropriate type.
259 * A more (for RDKit) conventional connection flag of isotope labelled
260 * dummy atoms is also accepted ([1*] etc.).
261 * Throws a std::runtime_error if it doesn't think the format is correct,
262 * which it does by checking that the first line is as above and subsequent
263 * lines have appropriate number of fields.
264 * If it receives a SIGINT, returns cancelled=true.
265 */
266 void readTextFile(const std::string &inFilename, bool &cancelled);
267
268 /*!
269 * Writes to a binary DB File in our format.
270 *
271 * @param outFilename: the name of the file to write.
272 */
273 void writeDBFile(const std::string &outFilename) const;
274
275 /*!
276 * Reads from a binary DB File in our format.
277 *
278 * @param inFilename: the name of the file to read.
279 * @param numThreads: number of threads to use in reading. If negative,
280 * adds the number to the number of hardware threads
281 * available.
282 */
283 void readDBFile(const std::string &inFilename, int numThreads = 1);
284
285 /*!
286 * Write a summary of the SynthonSpace to given stream.
287 *
288 * @param os: stream
289 */
290 void summarise(std::ostream &os);
291
292 /*!
293 * Writes the enumerated library to file in SMILES format
294 * (1 compound per line, SMILES name)
295 *
296 * @param outFilename: name of the file to write
297 */
298 void writeEnumeratedFile(const std::string &outFilename) const;
299
300 /*!
301 * Create the fingerprints for the synthons ready for fingerprint searches.
302 * Will be done by the fingerprint search if not done ahead of time.
303 *
304 * @param fpGen: a fingerprint generator of the appropriate type
305 */
308
309 protected:
310 unsigned int getMaxNumSynthons() const { return d_maxNumSynthons; }
311
312 bool hasFingerprints() const;
313
315
316 // Take the SMILES for a Synthon and if it's not in
317 // d_synthonPool make it and add it. If it is in the pool,
318 // just look it up. Either way, return a pointer to the
319 // Synthon.
320 Synthon *addSynthonToPool(const std::string &smiles);
321 std::shared_ptr<SynthonSet> addReactionToPool(
322 const std::string &reactionName);
323
324 // Just do the lookup, and return nullptr if not found.
325 Synthon *getSynthonFromPool(const std::string &smiles) const;
326
327 private:
328 std::string d_fileName;
329 // The reactions, keyed on their IDs as the first value
330 // in the pair.
331 std::vector<std::pair<std::string, std::shared_ptr<SynthonSet>>> d_reactions;
332 // Keep the value of the maximum number of synthon sets used by
333 // any of the reactions. There's no point fragmenting any
334 // query into more than this number of fragments. Shouldn't
335 // ever be higher than 4 at present.
336 unsigned int d_maxNumSynthons{0};
337 std::uint64_t d_numProducts{0};
338
339 // This is actually 1000 * major version + 10 * minor version
340 // and hence the full version number.
341 std::int32_t d_fileMajorVersion{-1};
342
343 // The pool of all synthons, keyed on SMILES string. Synthons
344 // are frequently re-used in different reactions, so this means
345 // they're only stored once. They will be sorted and searched
346 // for via first, which is its SMILES string.
347 std::vector<std::pair<std::string, std::unique_ptr<Synthon>>> d_synthonPool;
348
349 // For the similarity search, this records the generator used for
350 // creating synthon fingerprints that are read from a binary file.
351 std::string d_fpType;
352
353 SearchResults extendedSearch(const MolBundle &query,
354 const SubstructMatchParameters &matchParams,
355 const SynthonSpaceSearchParams &params);
356 SearchResults extendedSearch(
357 const GeneralizedSubstruct::ExtendedQueryMol::TautomerBundle_T &query,
358 const SubstructMatchParameters &matchParams,
359 const SynthonSpaceSearchParams &params);
360 SearchResults extendedSearch(const TautomerQuery &query,
361 const SubstructMatchParameters &matchParams,
362 const SynthonSpaceSearchParams &params);
363};
364
365/*!
366 * Convert the text file into the binary DB file in our format.
367 * Equivalent to readTextFile() followed by writeDBFile().
368 * If a fingerprint generator is provided, fingerprints will
369 * be created for all the synthons, which can be time-consuming.
370 * @param inFilename name of the text file to read
371 * @param outFilename name of the binary file to write
372 * @param cancelled whether it received a SIGINT
373 * @param fpGen optional fingerprint generator
374 */
376 const std::string &inFilename, const std::string &outFilename,
377 bool &cancelled,
379
380/*!
381 * Format an integer with spaces every 3 digits for ease
382 * of reading.
383 *
384 * @return std::string
385 */
386std::string formattedIntegerString(std::int64_t value);
387
388} // namespace SynthonSpaceSearch
389} // namespace RDKit
390
391#endif // RDKIT_SYNTHONSPACE_H
class that generates same fingerprint style for different output formats
std::string getSynthonFingerprintType() const
void readTextFile(const std::string &inFilename, bool &cancelled)
SearchResults substructureSearch(const GeneralizedSubstruct::ExtendedQueryMol &query, const SubstructMatchParameters &matchParams=SubstructMatchParameters(), const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
const std::shared_ptr< SynthonSet > getReaction(std::string reactionName)
SearchResults rascalSearch(const ROMol &query, const RascalMCES::RascalOptions &rascalOptions, const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
Synthon * addSynthonToPool(const std::string &smiles)
SynthonSpace & operator=(const SynthonSpace &other)=delete
SynthonSpace(const SynthonSpace &other)=delete
void writeEnumeratedFile(const std::string &outFilename) const
void writeDBFile(const std::string &outFilename) const
SearchResults fingerprintSearch(const ROMol &query, const FingerprintGenerator< std::uint64_t > &fpGen, const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
std::shared_ptr< SynthonSet > addReactionToPool(const std::string &reactionName)
void readDBFile(const std::string &inFilename, int numThreads=1)
SearchResults substructureSearch(const ROMol &query, const SubstructMatchParameters &matchParams=SubstructMatchParameters(), const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
Synthon * getSynthonFromPool(const std::string &smiles) const
std::vector< std::string > getReactionNames() const
void buildSynthonFingerprints(const FingerprintGenerator< std::uint64_t > &fpGen)
#define RDKIT_SYNTHONSPACESEARCH_EXPORT
Definition export.h:545
RDKIT_SYNTHONSPACESEARCH_EXPORT void convertTextToDBFile(const std::string &inFilename, const std::string &outFilename, bool &cancelled, const FingerprintGenerator< std::uint64_t > *fpGen=nullptr)
std::string formattedIntegerString(std::int64_t value)
constexpr unsigned int MAX_CONNECTOR_NUM
const std::vector< std::string > CONNECTOR_SYMBOLS
Std stuff.
bool rdvalue_is(const RDValue_cast_t)