RDKit
Open-source cheminformatics and machine learning.
FPBReader.h
Go to the documentation of this file.
1 //
2 // Copyright (c) 2016 Greg Landrum
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #ifndef RD_FPBREADER_H_DEC2015
11 #define RD_FPBREADER_H_DEC2015
12 /*! \file FPBReader.h
13 
14  \brief contains a simple class for reading and searching FPB files
15 
16  \b Note that this functionality is experimental and the API may change
17  in future releases.
18 */
19 
20 #include <iostream>
21 #include <fstream>
22 #include <sstream>
23 #include <string>
26 
27 #include <boost/cstdint.hpp>
28 #include <boost/shared_ptr.hpp>
29 #include <boost/shared_array.hpp>
30 
31 namespace RDKit {
32 namespace detail {
33 struct FPBReader_impl;
34 }
35 
36 //! class for reading and searching FPB files
37 /*!
38  basic usage:
39  \code
40  FPBReader reader("foo.fpb");
41  reader.init();
42  boost::shared_ptr<ExplicitBitVect> ebv = reader.getFP(95);
43  std::vector<std::pair<double, unsigned int> > nbrs =
44  reader.getTanimotoNeighbors(*ebv.get(), 0.70);
45  \endcode
46 
47  \b Note: this functionality is experimental and the API may change
48  in future releases.
49 
50  <b>Note on thread safety</b>
51  Operations that involve reading from the FPB file are not thread safe.
52  This means that the \c init() method is not thread safe and none of the
53  search operations are thread safe when an \c FPBReader is initialized in
54  \c lazyRead mode.
55 
56 */
57 class FPBReader {
58  public:
60  : dp_istrm(NULL),
61  dp_impl(NULL),
62  df_owner(false),
63  df_init(false),
64  df_lazyRead(false){};
65  //! ctor for reading from a named file
66  /*!
67  \param fname the name of the file to reads
68  \param lazyRead if set to \c false all fingerprints from the file will be read
69  into memory when \c init() is called.
70  */
71  FPBReader(const char *fname, bool lazyRead = false) {
72  _initFromFilename(fname, lazyRead);
73  };
74  //! \overload
75  FPBReader(const std::string &fname, bool lazyRead = false) {
76  _initFromFilename(fname.c_str(), lazyRead);
77  };
78  //! ctor for reading from an open istream
79  /*!
80  \param inStream the stream to read from
81  \param takeOwnership if set, we will take over ownership of the stream pointer
82  \param lazyRead if set to \c false all fingerprints from the file will be read
83  into memory when \c init() is called.
84 
85  Some additional notes:
86  - if \c lazyRead is set, \c inStream must support the \c seekg() and \c
87  tellg() operations.
88 
89  */
90  FPBReader(std::istream *inStream, bool takeOwnership = true,
91  bool lazyRead = false)
92  : dp_istrm(inStream),
93  dp_impl(NULL),
94  df_owner(takeOwnership),
95  df_init(false),
96  df_lazyRead(lazyRead){};
98  destroy();
99  if (df_owner) delete dp_istrm;
100  dp_istrm = NULL;
101  df_init = false;
102  };
103 
104  //! Read the data from the file and initialize internal data structures
105  /*!
106  This must be called before most of the other methods of this class.
107 
108  Some notes:
109  \li if \c lazyRead is not set, all fingerprints will be read into memory. This
110  can require substantial amounts of memory for large files.
111  \li For large files, this can take a long time.
112  \li If \c lazyRead and \c takeOwnership are both \c false it is safe to close
113  and delete inStream after calling \c init()
114  */
115  void init();
116  //! cleanup
117  /*!
118  Cleans up whatever memory was allocated during init()
119  */
120  void cleanup() {
121  if (!df_init) return;
122  destroy();
123  df_init = false;
124  };
125  //! returns the requested fingerprint as an \c ExplicitBitVect
126  boost::shared_ptr<ExplicitBitVect> getFP(unsigned int idx) const;
127  //! returns the requested fingerprint as an array of bytes
128  boost::shared_array<boost::uint8_t> getBytes(unsigned int idx) const;
129 
130  //! returns the id of the requested fingerprint
131  std::string getId(unsigned int idx) const;
132  //! returns the fingerprint and id of the requested fingerprint
133  std::pair<boost::shared_ptr<ExplicitBitVect>, std::string> operator[](
134  unsigned int idx) const {
135  return std::make_pair(getFP(idx), getId(idx));
136  };
137 
138  //! returns beginning and end indices of fingerprints having on-bit counts
139  //! within the range (including end points)
140  std::pair<unsigned int, unsigned int> getFPIdsInCountRange(
141  unsigned int minCount, unsigned int maxCount);
142 
143  //! returns the number of fingerprints
144  unsigned int length() const;
145  //! returns the number of bits in our fingerprints
146  unsigned int nBits() const;
147 
148  //! returns the tanimoto similarity between the specified fingerprint and the
149  //! provided fingerprint
150  double getTanimoto(unsigned int idx, const boost::uint8_t *bv) const;
151  //! \overload
152  double getTanimoto(unsigned int idx,
153  boost::shared_array<boost::uint8_t> bv) const {
154  return getTanimoto(idx, bv.get());
155  };
156  //! \overload
157  double getTanimoto(unsigned int idx, const ExplicitBitVect &ebv) const;
158 
159  //! returns tanimoto neighbors that are within a similarity threshold
160  /*!
161  The result vector of (similarity,index) pairs is sorted in order
162  of decreasing similarity
163 
164  \param bv the query fingerprint
165  \param threshold the minimum similarity to return
166  \param usePopcountScreen if this is true (the default) the popcount of the
167  neighbors will be used to reduce the number of calculations that need
168  to be done
169 
170  */
171  std::vector<std::pair<double, unsigned int> > getTanimotoNeighbors(
172  const boost::uint8_t *bv, double threshold = 0.7,
173  bool usePopcountScreen = true) const;
174  //! \overload
175  std::vector<std::pair<double, unsigned int> > getTanimotoNeighbors(
176  boost::shared_array<boost::uint8_t> bv, double threshold = 0.7,
177  bool usePopcountScreen = true) const {
178  return getTanimotoNeighbors(bv.get(), threshold, usePopcountScreen);
179  };
180  //! \overload
181  std::vector<std::pair<double, unsigned int> > getTanimotoNeighbors(
182  const ExplicitBitVect &ebv, double threshold = 0.7,
183  bool usePopcountScreen = true) const;
184 
185  //! returns the Tversky similarity between the specified fingerprint and the
186  //! provided fingerprint
187  /*!
188 
189  \param idx the fingerprint to compare to
190  \param bv the query fingerprint
191  \param ca the Tversky a coefficient
192  \param cb the Tversky b coefficient
193 
194  */
195  double getTversky(unsigned int idx, const boost::uint8_t *bv, double ca,
196  double cb) const;
197  //! \overload
198  double getTversky(unsigned int idx, boost::shared_array<boost::uint8_t> bv,
199  double ca, double cb) const {
200  return getTversky(idx, bv.get(), ca, cb);
201  };
202  //! \overload
203  double getTversky(unsigned int idx, const ExplicitBitVect &ebv, double ca,
204  double cb) const;
205 
206  //! returns Tversky neighbors that are within a similarity threshold
207  /*!
208  The result vector of (similarity,index) pairs is sorted in order
209  of decreasing similarity
210 
211  \param bv the query fingerprint
212  \param ca the Tversky a coefficient
213  \param cb the Tversky b coefficient
214  \param threshold the minimum similarity to return
215  \param usePopcountScreen if this is true (the default) the popcount of the
216  neighbors will be used to reduce the number of calculations that need
217  to be done
218 
219  */
220  std::vector<std::pair<double, unsigned int> > getTverskyNeighbors(
221  const boost::uint8_t *bv, double ca, double cb, double threshold = 0.7,
222  bool usePopcountScreen = true) const;
223  //! \overload
224  std::vector<std::pair<double, unsigned int> > getTverskyNeighbors(
225  boost::shared_array<boost::uint8_t> bv, double ca, double cb,
226  double threshold = 0.7, bool usePopcountScreen = true) const {
227  return getTverskyNeighbors(bv.get(), ca, cb, threshold, usePopcountScreen);
228  };
229  //! \overload
230  std::vector<std::pair<double, unsigned int> > getTverskyNeighbors(
231  const ExplicitBitVect &ebv, double ca, double cb, double threshold = 0.7,
232  bool usePopcountScreen = true) const;
233 
234  //! returns indices of all fingerprints that completely contain this one
235  /*! (i.e. where all the bits set in the query are also set in the db
236  molecule)
237  */
238  std::vector<unsigned int> getContainingNeighbors(
239  const boost::uint8_t *bv) const;
240  //! \overload
241  std::vector<unsigned int> getContainingNeighbors(
242  boost::shared_array<boost::uint8_t> bv) const {
243  return getContainingNeighbors(bv.get());
244  };
245  //! \overload
246  std::vector<unsigned int> getContainingNeighbors(
247  const ExplicitBitVect &ebv) const;
248 
249  private:
250  std::istream *dp_istrm;
251  detail::FPBReader_impl *dp_impl; // implementation details
252  bool df_owner;
253  bool df_init;
254  bool df_lazyRead;
255 
256  // disable automatic copy constructors and assignment operators
257  // for this class and its subclasses. They will likely be
258  // carrying around stream pointers and copying those is a recipe
259  // for disaster.
260  FPBReader(const FPBReader &);
261  FPBReader &operator=(const FPBReader &);
262  void destroy();
263  void _initFromFilename(const char *fname, bool lazyRead) {
264  std::istream *tmpStream = static_cast<std::istream *>(
265  new std::ifstream(fname, std::ios_base::binary));
266  if (!tmpStream || (!(*tmpStream)) || (tmpStream->bad())) {
267  std::ostringstream errout;
268  errout << "Bad input file " << fname;
269  throw BadFileException(errout.str());
270  }
271  dp_istrm = tmpStream;
272  dp_impl = NULL;
273  df_owner = true;
274  df_init = false;
275  df_lazyRead = lazyRead;
276  }
277 };
278 }
279 #endif
class for reading and searching FPB files
Definition: FPBReader.h:57
used by various file parsing classes to indicate a bad file
FPBReader(std::istream *inStream, bool takeOwnership=true, bool lazyRead=false)
ctor for reading from an open istream
Definition: FPBReader.h:90
std::vector< std::pair< double, unsigned int > > getTverskyNeighbors(boost::shared_array< boost::uint8_t > bv, double ca, double cb, double threshold=0.7, bool usePopcountScreen=true) const
Definition: FPBReader.h:224
FPBReader(const std::string &fname, bool lazyRead=false)
Definition: FPBReader.h:75
std::vector< std::pair< double, unsigned int > > getTanimotoNeighbors(boost::shared_array< boost::uint8_t > bv, double threshold=0.7, bool usePopcountScreen=true) const
Definition: FPBReader.h:175
std::pair< boost::shared_ptr< ExplicitBitVect >, std::string > operator[](unsigned int idx) const
returns the fingerprint and id of the requested fingerprint
Definition: FPBReader.h:133
Includes a bunch of functionality for handling Atom and Bond queries.
Definition: Atom.h:29
double getTversky(unsigned int idx, boost::shared_array< boost::uint8_t > bv, double ca, double cb) const
Definition: FPBReader.h:198
FPBReader(const char *fname, bool lazyRead=false)
ctor for reading from a named file
Definition: FPBReader.h:71
std::vector< unsigned int > getContainingNeighbors(boost::shared_array< boost::uint8_t > bv) const
Definition: FPBReader.h:241
a class for bit vectors that are densely occupied
double getTanimoto(unsigned int idx, boost::shared_array< boost::uint8_t > bv) const
Definition: FPBReader.h:152
void cleanup()
cleanup
Definition: FPBReader.h:120