RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
SynthonSpace.h
Go to the documentation of this file.
1//
2// Copyright (C) David Cosgrove 2024.
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#ifndef RDKIT_SYNTHONSPACE_H
11#define RDKIT_SYNTHONSPACE_H
12
13/*! \file SynthonSpace.h
14
15 \brief contains a class for searching combinatorial libraries in
16 Synthon format such as Enamine REAL.
17
18 \b Note that this functionality is experimental and the API may change
19 in future releases.
20*/
21
22#include <map>
23#include <string>
24#include <vector>
25
26#include <boost/dynamic_bitset.hpp>
27
28#include <RDGeneral/export.h>
32
33namespace RDKit {
34class ROMol;
35
36namespace SynthonSpaceSearch {
37
38// This is the maximum number of connectors that we can deal with at the
39// moment. In reality, there may be fewer than this. However, the key limit
40// is in the symbols used for the connectors in Enamine REAL etc.
41const std::vector<std::string> CONNECTOR_SYMBOLS{"[U]", "[Np]", "[Pu]", "[Am]"};
42constexpr unsigned int MAX_CONNECTOR_NUM{4};
43
45 int maxBondSplits{MAX_CONNECTOR_NUM}; // The maximum number of bonds to break
46 // in the query. It should be no more
47 // than the maximum number of connector
48 // types in the SynthonSpace. At
49 // present this is 4. Specifying more
50 // than that will not matter as it will
51 // be reduced to 4. Likewise, values
52 // lower than 1 will be increased to 1.
53 std::int64_t maxHits{1000}; // The maximum number of hits to return. Use -1
54 // for no maximum.
55 std::int64_t hitStart{0}; // Sequence number of hit to start from. So that
56 // you can return the next N hits of a search
57 // having already obtained N-1.
58 bool randomSample{false}; // If true, returns a random sample of the
59 // hits, up to maxHits in number.
60 int randomSeed{-1}; // Seed for random-number generator. -1 means use
61 // a random seed (std::random_device).
62 bool buildHits{true}; // If false, reports the maximum number of hits that
63 // the search could produce, but doesn't return them.
64 int numRandomSweeps{10}; // The random sampling doesn't always produce the
65 // required number of hits in 1 go. This parameter
66 // controls how many loops it makes to try and get
67 // the hits before giving up.
68 double similarityCutoff{0.5}; // Similarity cutoff for returning hits by
69 // fingerprint similarity. The default is
70 // appropriate for a Morgan fingerprint of
71 // radius=2, it may need changing for other
72 // fingerprint types.
73 double fragSimilarityAdjuster{
74 0.1}; // Similarity values for fragments are generally low
75 // due to low bit densities. For the fragment
76 // matching, reduce the similarity cutoff
77 // by this amount. A higher number will give slower search
78 // times, a lower number will give faster searches at the
79 // risk of missing some hits. The value you use should have
80 // a positive correlation with your FOMO.
81};
82
83// Holds the information about a set of hits. The molecules can be built
84// by making all combinations of synthons, one taken from each synthon set.
86 std::string reactionId;
87 std::vector<boost::dynamic_bitset<>> synthonsToUse;
88 size_t numHits{0};
89};
90
92 public:
93 // Create an empty SynthonSpace. Use readTextFile() or readDBFile() to load a library.
94 explicit SynthonSpace() = default;
95 SynthonSpace(const SynthonSpace &other) = delete;
96 SynthonSpace &operator=(const SynthonSpace &other) = delete;
97 // Get the number of different reactions in the SynthonSpace.
98 /*!
99 *
100 * @return size_t : the number of reactions
101 */
102 size_t getNumReactions() const { return d_reactions.size(); }
103 const std::map<std::string, std::unique_ptr<SynthonSet>> &getReactions()
104 const {
105 return d_reactions;
106 }
107
108 // Get the total number of products that the SynthonSpace could produce.
109 /*!
110 *
111 * @return std::int64_t
112 */
113 std::int64_t getNumProducts() const;
114
115 std::string getSynthonFingerprintType() const { return d_fpType; }
116
117 // Perform a substructure search with the given query molecule across
118 // the synthonspace library. Duplicate SMILES strings produced by different
119 // reactions will be returned.
120 /*!
121 *
122 * @param query : query molecule
123 * @param params : (optional) settings for the search
124 * @return : the hits as a SearchResults object.
125 */
127 const ROMol &query,
129
130 // Perform a fingerprint similarity search with the given query molecule
131 // across the synthonspace library. Duplicate SMILES strings produced by
132 // different reactions will be returned.
133 /*!
134 *
135 * @param query : query molecule
136 * @param fpGen: a FingerprintGenerator object that will provide the
137 * fingerprints for the similarity calculation
138 * @param params : (optional) settings for the search
139 * @return : the hits as a SearchResults object.
140 */
142 const ROMol &query, const FingerprintGenerator<std::uint64_t> &fpGen,
144
145 /*!
146 *
147 * @param inFilename: name of the file containing the synthon-based library.
148 *
149 * The original format is:
150 * all lines are tab-separated
151 * first line:SMILES synton_id synton# reaction_id
152 * Note the spelling "synton" from the original paper/example file.
153 * Subsequent lines have a single reagent e.g.
154 * OCC([U])=NN=[Np] 1-1 0 triazole-1
155 * C1CCCC1N([Pu])[U] 2-1 1 triazole-1
156 * CC1CCN(C1)C(=[Np])[Pu] 3-1 2 triazole-1
157 *
158 * Other acceptable formats are as above, but with a 5th column "release":
159 * SMILES synton_id synton# reaction_id release
160 *
161 * or a comma-separated equivalent of the first format:
162 * SMILES,synton_id,synton_role,reaction_id
163 * where the 3rd column is named differently but has the same meaning.
164 * The formatting of the first 2 formats has been relaxed such that any
165 * whitespace may be used as the field separator.
166 *
167 * Attachment points are U, Np, Pu and Am for up to 4 synthons per reaction.
168 * A product is created by taking a synthon from each synton# value and
169 * combining them by replacing matching trans-uranic elements
170 * with a direct bond of the appropriate type.
171 * A more (for RDKit) conventional connection flag of isotope labelled
172 * dummy atoms is also accepted ([1*] etc.).
173 * Throws a std::runtime_error if it doesn't think the format is correct,
174 * which it does by checking that the first line is as above and subsequent
175 * lines have appropriate number of fields.
176 */
177 void readTextFile(const std::string &inFilename);
178
179 // Writes to/reads from a binary DB File in our format.
180 /*!
181 *
182 * @param outFilename: the name of the file to write.
183 */
184 void writeDBFile(const std::string &outFilename) const;
185 /*!
186 *
187 * @param inFilename: the name of the file to read.
188 */
189 void readDBFile(const std::string &inFilename);
190
191 // Write a summary of the SynthonSpace to given stream.
192 /*!
193 *
194 * @param os: stream
195 */
196 void summarise(std::ostream &os) const;
197
198 // Writes the enumerated library to file in SMILES format (1 compound
199 // per line, SMILES name).
200 /*!
201 @param outFilename: name of the file to write
202 */
203 void writeEnumeratedFile(const std::string &outFilename) const;
204
205 bool hasFingerprints() const;
206 // Create the fingerprints for the synthons ready for fingerprint searches.
207 // The fingerprints are made with the FingerprintGenerator passed in as fpGen.
210
211 private:
212 std::string d_fileName;
213 std::map<std::string, std::unique_ptr<SynthonSet>> d_reactions;
214
215 // For the similarity search, this records the generator used for
216 // creating synthon fingerprints that are read from a binary file.
217 std::string d_fpType;
218};
219
220} // namespace SynthonSpaceSearch
221} // namespace RDKit
222
223#endif // RDKIT_SYNTHONSPACE_H
class that generates same fingerprint style for different output formats
std::string getSynthonFingerprintType() const
const std::map< std::string, std::unique_ptr< SynthonSet > > & getReactions() const
void readTextFile(const std::string &inFilename)
void summarise(std::ostream &os) const
SynthonSpace & operator=(const SynthonSpace &other)=delete
SearchResults substructureSearch(const ROMol &query, const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
SynthonSpace(const SynthonSpace &other)=delete
void writeEnumeratedFile(const std::string &outFilename) const
void writeDBFile(const std::string &outFilename) const
SearchResults fingerprintSearch(const ROMol &query, const FingerprintGenerator< std::uint64_t > &fpGen, const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
void readDBFile(const std::string &inFilename)
void buildSynthonFingerprints(const FingerprintGenerator< std::uint64_t > &fpGen)
#define RDKIT_SYNTHONSPACESEARCH_EXPORT
Definition export.h:545
constexpr unsigned int MAX_CONNECTOR_NUM
const std::vector< std::string > CONNECTOR_SYMBOLS
Std stuff.
std::vector< boost::dynamic_bitset<> > synthonsToUse