RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
FPBReader.h
Go to the documentation of this file.
1//
2// Copyright (c) 2016 Greg Landrum
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_FPBREADER_H_DEC2015
12#define RD_FPBREADER_H_DEC2015
13/*! \file FPBReader.h
14
15 \brief contains a simple class for reading and searching FPB files
16
17 \b Note that this functionality is experimental and the API may change
18 in future releases.
19*/
20
21#include <iostream>
22#include <fstream>
23#include <sstream>
24#include <string>
27
28#include <cstdint>
29#include <boost/shared_ptr.hpp>
30#include <boost/shared_array.hpp>
31
32namespace RDKit {
33namespace detail {
34struct FPBReader_impl;
35}
36
37//! class for reading and searching FPB files
38/*!
39 basic usage:
40 \code
41 FPBReader reader("foo.fpb");
42 reader.init();
43 boost::shared_ptr<ExplicitBitVect> ebv = reader.getFP(95);
44 std::vector<std::pair<double, unsigned int> > nbrs =
45 reader.getTanimotoNeighbors(*ebv.get(), 0.70);
46 \endcode
47
48 \b Note: this functionality is experimental and the API may change
49 in future releases.
50
51 <b>Note on thread safety</b>
52 Operations that involve reading from the FPB file are not thread safe.
53 This means that the \c init() method is not thread safe and none of the
54 search operations are thread safe when an \c FPBReader is initialized in
55 \c lazyRead mode.
56
57*/
59 public:
61
62 //! ctor for reading from a named file
63 /*!
64 \param fname the name of the file to read
65 \param lazyRead if set to \c false all fingerprints from the file will be read
66 into memory when \c init() is called.
67 */
68 FPBReader(const char *fname, bool lazyRead = false) {
69 _initFromFilename(fname, lazyRead);
70 }
71 //! \overload
72 FPBReader(const std::string &fname, bool lazyRead = false) {
73 _initFromFilename(fname.c_str(), lazyRead);
74 }
75 //! ctor for reading from an open istream
76 /*!
77 \param inStream the stream to read from
78 \param takeOwnership if set, we will take over ownership of the stream pointer
79 \param lazyRead if set to \c false all fingerprints from the file will be read
80 into memory when \c init() is called.
81
82 Some additional notes:
83 - if \c lazyRead is set, \c inStream must support the \c seekg() and \c
84 tellg() operations.
85
86 */
87 FPBReader(std::istream *inStream, bool takeOwnership = true,
88 bool lazyRead = false)
89 : dp_istrm(inStream),
90 dp_impl(nullptr),
91 df_owner(takeOwnership),
92 df_init(false),
93 df_lazyRead(lazyRead) {}
95 destroy();
96 if (df_owner) {
97 delete dp_istrm;
98 }
99 dp_istrm = nullptr;
100 df_init = false;
101 }
102
103 //! Read the data from the file and initialize internal data structures
104 /*!
105 This must be called before most of the other methods of this class.
106
107 Some notes:
108 \li if \c lazyRead is not set, all fingerprints will be read into memory. This
109 can require substantial amounts of memory for large files.
110 \li For large files, this can take a long time.
111 \li If \c lazyRead and \c takeOwnership are both \c false it is safe to close
112 and delete inStream after calling \c init()
113 */
114 void init();
115 //! cleanup
116 /*!
117 Cleans up whatever memory was allocated during init()
118 */
119 void cleanup() {
120 if (!df_init) {
121 return;
122 }
123 destroy();
124 df_init = false;
125 }
126 //! returns the requested fingerprint as an \c ExplicitBitVect
127 boost::shared_ptr<ExplicitBitVect> getFP(unsigned int idx) const;
128 //! returns the requested fingerprint as an array of bytes
129 boost::shared_array<std::uint8_t> getBytes(unsigned int idx) const;
130
131 //! returns the id of the requested fingerprint
132 std::string getId(unsigned int idx) const;
133 //! returns the fingerprint and id of the requested fingerprint
134 std::pair<boost::shared_ptr<ExplicitBitVect>, std::string> operator[](
135 unsigned int idx) const {
136 return std::make_pair(getFP(idx), getId(idx));
137 }
138
139 //! returns beginning and end indices of fingerprints having on-bit counts
140 //! within the range (including end points)
141 std::pair<unsigned int, unsigned int> getFPIdsInCountRange(
142 unsigned int minCount, unsigned int maxCount);
143
144 //! returns the number of fingerprints
145 unsigned int length() const;
146 //! returns the number of bits in our fingerprints
147 unsigned int nBits() const;
148
149 //! returns the tanimoto similarity between the specified fingerprint and the
150 //! provided fingerprint
151 double getTanimoto(unsigned int idx, const std::uint8_t *bv) const;
152 //! \overload
153 double getTanimoto(unsigned int idx,
154 boost::shared_array<std::uint8_t> bv) const {
155 return getTanimoto(idx, bv.get());
156 }
157 //! \overload
158 double getTanimoto(unsigned int idx, const ExplicitBitVect &ebv) const;
159
160 //! returns tanimoto neighbors that are within a similarity threshold
161 /*!
162 The result vector of (similarity,index) pairs is sorted in order
163 of decreasing similarity
164
165 \param bv the query fingerprint
166 \param threshold the minimum similarity to return
167 \param usePopcountScreen if this is true (the default) the popcount of the
168 neighbors will be used to reduce the number of calculations that need
169 to be done
170
171 */
172 std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
173 const std::uint8_t *bv, double threshold = 0.7,
174 bool usePopcountScreen = true) const;
175 //! \overload
176 std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
177 boost::shared_array<std::uint8_t> bv, double threshold = 0.7,
178 bool usePopcountScreen = true) const {
179 return getTanimotoNeighbors(bv.get(), threshold, usePopcountScreen);
180 }
181 //! \overload
182 std::vector<std::pair<double, unsigned int>> getTanimotoNeighbors(
183 const ExplicitBitVect &ebv, double threshold = 0.7,
184 bool usePopcountScreen = true) const;
185
186 //! returns the Tversky similarity between the specified fingerprint and the
187 //! provided fingerprint
188 /*!
189
190 \param idx the fingerprint to compare to
191 \param bv the query fingerprint
192 \param ca the Tversky a coefficient
193 \param cb the Tversky b coefficient
194
195 */
196 double getTversky(unsigned int idx, const std::uint8_t *bv, double ca,
197 double cb) const;
198 //! \overload
199 double getTversky(unsigned int idx, boost::shared_array<std::uint8_t> bv,
200 double ca, double cb) const {
201 return getTversky(idx, bv.get(), ca, cb);
202 }
203 //! \overload
204 double getTversky(unsigned int idx, const ExplicitBitVect &ebv, double ca,
205 double cb) const;
206
207 //! returns Tversky neighbors that are within a similarity threshold
208 /*!
209 The result vector of (similarity,index) pairs is sorted in order
210 of decreasing similarity
211
212 \param bv the query fingerprint
213 \param ca the Tversky a coefficient
214 \param cb the Tversky b coefficient
215 \param threshold the minimum similarity to return
216 \param usePopcountScreen if this is true (the default) the popcount of the
217 neighbors will be used to reduce the number of calculations that need
218 to be done
219
220 */
221 std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
222 const std::uint8_t *bv, double ca, double cb, double threshold = 0.7,
223 bool usePopcountScreen = true) const;
224 //! \overload
225 std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
226 boost::shared_array<std::uint8_t> bv, double ca, double cb,
227 double threshold = 0.7, bool usePopcountScreen = true) const {
228 return getTverskyNeighbors(bv.get(), ca, cb, threshold, usePopcountScreen);
229 }
230 //! \overload
231 std::vector<std::pair<double, unsigned int>> getTverskyNeighbors(
232 const ExplicitBitVect &ebv, double ca, double cb, double threshold = 0.7,
233 bool usePopcountScreen = true) const;
234
235 //! returns indices of all fingerprints that completely contain this one
236 /*! (i.e. where all the bits set in the query are also set in the db
237 molecule)
238 */
239 std::vector<unsigned int> getContainingNeighbors(
240 const std::uint8_t *bv) const;
241 //! \overload
242 std::vector<unsigned int> getContainingNeighbors(
243 boost::shared_array<std::uint8_t> bv) const {
244 return getContainingNeighbors(bv.get());
245 }
246 //! \overload
247 std::vector<unsigned int> getContainingNeighbors(
248 const ExplicitBitVect &ebv) const;
249
250 private:
251 std::istream *dp_istrm{nullptr};
252 detail::FPBReader_impl *dp_impl{nullptr}; // implementation details
253 bool df_owner{false};
254 bool df_init{false};
255 bool df_lazyRead{false};
256
257 // disable automatic copy constructors and assignment operators
258 // for this class and its subclasses. They will likely be
259 // carrying around stream pointers and copying those is a recipe
260 // for disaster.
261 FPBReader(const FPBReader &);
262 FPBReader &operator=(const FPBReader &);
263 void destroy();
264 void _initFromFilename(const char *fname, bool lazyRead) {
265 std::istream *tmpStream = static_cast<std::istream *>(
266 new std::ifstream(fname, std::ios_base::binary));
267 if (!(*tmpStream) || (tmpStream->bad())) {
268 std::ostringstream errout;
269 errout << "Bad input file " << fname;
270 delete tmpStream;
271 throw BadFileException(errout.str());
272 }
273 dp_istrm = tmpStream;
274 dp_impl = nullptr;
275 df_owner = true;
276 df_init = false;
277 df_lazyRead = lazyRead;
278 }
279};
280} // namespace RDKit
281#endif
a class for bit vectors that are densely occupied
class for reading and searching FPB files
Definition FPBReader.h:58
std::vector< unsigned int > getContainingNeighbors(boost::shared_array< std::uint8_t > bv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition FPBReader.h:242
std::pair< unsigned int, unsigned int > getFPIdsInCountRange(unsigned int minCount, unsigned int maxCount)
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(const std::uint8_t *bv, double threshold=0.7, bool usePopcountScreen=true) const
returns tanimoto neighbors that are within a similarity threshold
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(const ExplicitBitVect &ebv, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(const std::uint8_t *bv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
returns Tversky neighbors that are within a similarity threshold
void cleanup()
cleanup
Definition FPBReader.h:119
std::vector< unsigned int > getContainingNeighbors(const std::uint8_t *bv) const
returns indices of all fingerprints that completely contain this one
double getTversky(unsigned int idx, const std::uint8_t *bv, double ca, double cb) const
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(boost::shared_array< std::uint8_t > bv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition FPBReader.h:225
unsigned int length() const
returns the number of fingerprints
boost::shared_array< std::uint8_t > getBytes(unsigned int idx) const
returns the requested fingerprint as an array of bytes
double getTanimoto(unsigned int idx, const std::uint8_t *bv) const
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(const ExplicitBitVect &ebv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
boost::shared_ptr< ExplicitBitVect > getFP(unsigned int idx) const
returns the requested fingerprint as an ExplicitBitVect
double getTanimoto(unsigned int idx, boost::shared_array< std::uint8_t > bv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition FPBReader.h:153
double getTversky(unsigned int idx, boost::shared_array< std::uint8_t > bv, double ca, double cb) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition FPBReader.h:199
std::pair< boost::shared_ptr< ExplicitBitVect >, std::string > operator[](unsigned int idx) const
returns the fingerprint and id of the requested fingerprint
Definition FPBReader.h:134
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(boost::shared_array< std::uint8_t > bv, double threshold=0.7, bool usePopcountScreen=true) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition FPBReader.h:176
double getTversky(unsigned int idx, const ExplicitBitVect &ebv, double ca, double cb) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
FPBReader(std::istream *inStream, bool takeOwnership=true, bool lazyRead=false)
ctor for reading from an open istream
Definition FPBReader.h:87
FPBReader(const char *fname, bool lazyRead=false)
ctor for reading from a named file
Definition FPBReader.h:68
FPBReader(const std::string &fname, bool lazyRead=false)
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition FPBReader.h:72
unsigned int nBits() const
returns the number of bits in our fingerprints
std::vector< unsigned int > getContainingNeighbors(const ExplicitBitVect &ebv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
double getTanimoto(unsigned int idx, const ExplicitBitVect &ebv) const
This is an overloaded member function, provided for convenience. It differs from the above function o...
std::string getId(unsigned int idx) const
returns the id of the requested fingerprint
void init()
Read the data from the file and initialize internal data structures.
#define RDKIT_DATASTRUCTS_EXPORT
Definition export.h:81
Std stuff.
bool rdvalue_is(const RDValue_cast_t)
RDKIT_FINGERPRINTS_EXPORT ExplicitBitVect * getFP(const ROMol &mol, FPType fPType)