RDKit
Open-source cheminformatics and machine learning.
Catalog.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2006 Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 
11 #ifndef __RD_CATALOG_H__
12 #define __RD_CATALOG_H__
13 
14 // Boost graph stuff
16 #include <boost/graph/graph_traits.hpp>
17 #include <boost/graph/adjacency_list.hpp>
18 #include <boost/version.hpp>
19 #if BOOST_VERSION >= 104000
20 #include <boost/property_map/property_map.hpp>
21 #else
22 #include <boost/property_map.hpp>
23 #endif
25 
26 // for some typedefs
27 #include <RDGeneral/types.h>
28 #include <RDGeneral/StreamOps.h>
29 
30 namespace RDCatalog {
31 const int versionMajor = 1;
32 const int versionMinor = 0;
33 const int versionPatch = 0;
34 const int endianId = 0xDEADBEEF;
35 
36 //-----------------------------------------------------------------------------
37 //! abstract base class for a catalog object
38 template <class entryType, class paramType>
39 class Catalog {
40  public:
41  typedef entryType entryType_t;
42  typedef paramType paramType_t;
43 
44  //------------------------------------
46 
47  //------------------------------------
48  virtual ~Catalog() { delete dp_cParams; }
49 
50  //------------------------------------
51  //! return a serialized form of the Catalog as an std::string
52  virtual std::string Serialize() const = 0;
53 
54  //------------------------------------
55  //! adds an entry to the catalog
56  /*!
57 
58  \param entry the entry to be added
59  \param updateFPLength (optional) if this is true, our internal
60  fingerprint length will also be updated.
61 
62  */
63  virtual unsigned int addEntry(entryType *entry,
64  bool updateFPLength = true) = 0;
65 
66  //------------------------------------
67  //! returns a particular entry in the Catalog
68  virtual const entryType *getEntryWithIdx(unsigned int idx) const = 0;
69 
70  //------------------------------------
71  //! returns the number of entries
72  virtual unsigned int getNumEntries() const = 0;
73 
74  //------------------------------------
75  //! returns the length of our fingerprint
76  unsigned int getFPLength() const { return d_fpLength; }
77 
78  //------------------------------------
79  //! sets our fingerprint length
80  void setFPLength(unsigned int val) { d_fpLength = val; }
81 
82  //------------------------------------
83  //! sets our parameters by copying the \c params argument
84  virtual void setCatalogParams(paramType *params) {
85  PRECONDITION(params, "bad parameter object");
86  // if we already have a paramter object throw an exception
88  "A parameter object already exists on the catalog");
89  /*
90  if (dp_cParams) {
91  // we already have parameter object on the catalog
92  // can't overwrite it
93  PRECONDITION(0, "A parameter object already exist on the catalog");
94  }*/
95  dp_cParams = new paramType(*params);
96  }
97 
98  //------------------------------------
99  //! returns a pointer to our parameters
100  const paramType *getCatalogParams() const { return dp_cParams; }
101 
102  protected:
103  // this is the ID that will be assigned to the next entry
104  // added to the catalog - need not be same as the number of entries
105  // in the catalog and does not correspond with the
106  // id of the entry in the catalog.
107  // this is more along the lines of bitId
108  unsigned int d_fpLength; //!< the length of our fingerprint
109  paramType *dp_cParams; //!< our params object
110 };
111 
112 //-----------------------------------------------------------------------------
113 //! A Catalog with a hierarchical structure
114 /*!
115 
116  The entries of a HierarchCatalog are arranged in a directed graph
117 
118  <b>The difference between <i>Indices</i> and <i>Bit Ids</i></b>
119 
120  A HierarchCatalog may contain more entries than the user is actually
121  interested in. For example a HierarchCatalog constructed to contain
122  orders 5 through 8 may well contain information about orders 1-5,
123  in order to facilitate some search optimizations.
124 
125  - <i>Bit Ids</i> refer to the "interesting" bits.
126  So, in the above example, Bit Id \c 0 will be the first entry
127  with order 5.
128  - <i>Indices</i> refer to the underlying structure of the catalog.
129  So, in the above example, the entry with index \c 0 will be
130  the first entry with order 1.
131 
132 */
133 template <class entryType, class paramType, class orderType>
134 class HierarchCatalog : public Catalog<entryType, paramType> {
135  // the entries in the catalog can be traversed using the edges
136  // in a desired order
137  public:
138  //! used by the BGL to set up the node properties in our graph
139  struct vertex_entry_t {
140  enum { num = 1003 };
141  typedef boost::vertex_property_tag kind;
142  };
143  typedef boost::property<vertex_entry_t, entryType *> EntryProperty;
144 
145  //! the type of the graph itself:
146  typedef boost::adjacency_list<
147  boost::vecS,
148  boost::vecS, // FIX: should be using setS for edges so that parallel
149  // edges are never added (page 225 BGL book)
150  // but that seems result in compile errors
151  boost::bidirectionalS, EntryProperty> CatalogGraph;
152 
153  typedef boost::graph_traits<CatalogGraph> CAT_GRAPH_TRAITS;
154  typedef typename CAT_GRAPH_TRAITS::vertex_iterator VER_ITER;
155  typedef std::pair<VER_ITER, VER_ITER> ENT_ITER_PAIR;
156  typedef typename CAT_GRAPH_TRAITS::adjacency_iterator DOWN_ENT_ITER;
157  typedef std::pair<DOWN_ENT_ITER, DOWN_ENT_ITER> DOWN_ENT_ITER_PAIR;
158 
159  //------------------------------------
161 
162  //------------------------------------
163  //! Construct by making a copy of the input \c params object
166  this->setCatalogParams(params);
167  }
168 
169  //------------------------------------
170  //! Construct from a \c pickle (a serialized form of the HierarchCatalog)
172  this->initFromString(pickle);
173  }
174 
175  //------------------------------------
176  ~HierarchCatalog() { destroy(); }
177 
178  //------------------------------------
179  //! serializes this object to a stream
180  void toStream(std::ostream &ss) const {
181  PRECONDITION(this->getCatalogParams(), "NULL parameter object");
182 
183  // the i/o header:
184  RDKit::streamWrite(ss, endianId);
185  RDKit::streamWrite(ss, versionMajor);
186  RDKit::streamWrite(ss, versionMinor);
187  RDKit::streamWrite(ss, versionPatch);
188 
189  // information about the catalog itself:
190  int tmpUInt;
191  tmpUInt = this->getFPLength();
192  RDKit::streamWrite(ss, tmpUInt);
193  tmpUInt = this->getNumEntries();
194  RDKit::streamWrite(ss, tmpUInt);
195 
196  // std::cout << ">>>>-------------------------------" << std::endl;
197  // std::cout << "\tlength: " << getFPLength() << " " << getNumEntries() <<
198  // std::endl;
199 
200  // add the params object:
201  this->getCatalogParams()->toStream(ss);
202  // std::cout << "\tparams: " << getCatalogParams()->getLowerFragLength();
203  // std::cout << " " << getCatalogParams()->getUpperFragLength();
204  // std::cout << " " << getCatalogParams()->getNumFuncGroups();
205  // std::cout << std::endl;
206 
207  // write the entries in order:
208  for (unsigned int i = 0; i < getNumEntries(); i++) {
209  this->getEntryWithIdx(i)->toStream(ss);
210  }
211 
212  // finally the adjacency list:
213  for (unsigned int i = 0; i < getNumEntries(); i++) {
214  RDKit::INT_VECT children = this->getDownEntryList(i);
215  tmpUInt = static_cast<unsigned int>(children.size());
216  RDKit::streamWrite(ss, tmpUInt);
217  for (RDKit::INT_VECT::const_iterator ivci = children.begin();
218  ivci != children.end(); ivci++) {
219  RDKit::streamWrite(ss, *ivci);
220  }
221  }
222  }
223 
224  //------------------------------------
225  //! serializes this object and returns the resulting \c pickle
226  std::string Serialize() const {
227  std::stringstream ss(std::ios_base::binary | std::ios_base::out |
228  std::ios_base::in);
229  this->toStream(ss);
230  return ss.str();
231  }
232 
233  //------------------------------------
234  //! fills the contents of this object from a stream containing a \c pickle
235  void initFromStream(std::istream &ss) {
236  int tmpInt;
237  // FIX: at the moment we ignore the header info:
238  RDKit::streamRead(ss, tmpInt);
239  RDKit::streamRead(ss, tmpInt);
240  RDKit::streamRead(ss, tmpInt);
241  RDKit::streamRead(ss, tmpInt);
242 
243  unsigned int tmpUInt;
244  RDKit::streamRead(ss, tmpUInt); // fp length
245  this->setFPLength(tmpUInt);
246 
247  unsigned int numEntries;
248  RDKit::streamRead(ss, numEntries);
249  // std::cout << "<<<-------------------------------" << std::endl;
250  // std::cout << "\tlength: " << getFPLength() << " " << numEntries <<
251  // std::endl;
252 
253  // grab the params:
254  paramType *params = new paramType();
255  params->initFromStream(ss);
256  this->setCatalogParams(params);
257 
258  // std::cout << "\tparams: " << getCatalogParams()->getLowerFragLength();
259  // std::cout << " " << getCatalogParams()->getUpperFragLength();
260  // std::cout << " " << getCatalogParams()->getNumFuncGroups();
261  // std::cout << std::endl;
262 
263  // now all of the entries:
264  for (unsigned int i = 0; i < numEntries; i++) {
265  entryType *entry = new entryType();
266  entry->initFromStream(ss);
267  this->addEntry(entry, false);
268  }
269 
270  // and, finally, the adjacency list:
271  for (unsigned int i = 0; i < numEntries; i++) {
272  unsigned int nNeighbors;
273  RDKit::streamRead(ss, nNeighbors);
274  for (unsigned int j = 0; j < nNeighbors; j++) {
275  RDKit::streamRead(ss, tmpInt);
276  this->addEdge(i, tmpInt);
277  }
278  }
279  }
280 
281  //------------------------------------
282  unsigned int getNumEntries() const { return static_cast<unsigned int>(boost::num_vertices(d_graph)); }
283 
284  //------------------------------------
285  //! fills the contents of this object from a string containing a \c pickle
286  void initFromString(const std::string &text) {
287  std::stringstream ss(std::ios_base::binary | std::ios_base::out |
288  std::ios_base::in);
289  // initialize the stream:
290  ss.write(text.c_str(), text.length());
291  // now start reading out values:
292  this->initFromStream(ss);
293  }
294 
295  //------------------------------------
296  //! add a new entry to the catalog
297  /*!
298 
299  \param entry the entry to be added
300  \param updateFPLength (optional) if this is true, our internal
301  fingerprint length will also be updated.
302 
303  */
304  unsigned int addEntry(entryType *entry, bool updateFPLength = true) {
305  PRECONDITION(entry, "bad arguments");
306  if (updateFPLength) {
307  unsigned int fpl = this->getFPLength();
308  entry->setBitId(fpl);
309  fpl++;
310  this->setFPLength(fpl);
311  }
312  unsigned int eid = static_cast<unsigned int>(boost::add_vertex(EntryProperty(entry), d_graph));
313  orderType etype = entry->getOrder();
314  // REVIEW: this initialization is not required: the STL map, in
315  // theory, will create a new object when operator[] is called
316  // for a new item
317  if (d_orderMap.find(etype) == d_orderMap.end()) {
318  RDKit::INT_VECT nets;
319  d_orderMap[etype] = nets;
320  }
321  d_orderMap[etype].push_back(eid);
322  return eid;
323  }
324 
325  //------------------------------------
326  //! adds an edge between two entries in the catalog
327  /*!
328  Since we are using a bidirectional graph - the order in
329  which the ids are supplied here makes a difference
330 
331  \param id1 index of the edge's beginning
332  \param id2 index of the edge's end
333 
334  */
335  void addEdge(unsigned int id1, unsigned int id2) {
336  unsigned int nents = getNumEntries();
337  URANGE_CHECK(id1, nents - 1);
338  URANGE_CHECK(id2, nents - 1);
339  // FIX: if we boost::setS for the edgeList BGL will
340  // do the checking for duplicity (parallel edges)
341  // But for reasons unknown setS results in compile
342  // errors while using adjacent_vertices.
343  typename CAT_GRAPH_TRAITS::edge_descriptor edge;
344  bool found;
345  boost::tie(edge, found) = boost::edge(boost::vertex(id1, d_graph),
346  boost::vertex(id2, d_graph), d_graph);
347  if (!found) {
348  boost::add_edge(id1, id2, d_graph);
349  }
350  }
351 
352  //------------------------------------
353  //! returns a pointer to our entry with a particular index
354  const entryType *getEntryWithIdx(unsigned int idx) const {
355  URANGE_CHECK(idx, getNumEntries() - 1);
356  int vd = static_cast<int>(boost::vertex(idx, d_graph));
357  typename boost::property_map<CatalogGraph, vertex_entry_t>::const_type
358  pMap = boost::get(vertex_entry_t(), d_graph);
359  return pMap[vd];
360  }
361 
362  //------------------------------------
363  //! returns a pointer to our entry with a particular bit ID
364  const entryType *getEntryWithBitId(unsigned int idx) const {
365  URANGE_CHECK(idx, this->getFPLength() - 1);
366  typename boost::property_map<CatalogGraph, vertex_entry_t>::const_type
367  pMap = boost::get(vertex_entry_t(), d_graph);
368  const entryType *res = NULL;
369  for (unsigned int i = idx; i < this->getNumEntries(); i++) {
370  const entryType *e = pMap[i];
371  if (e->getBitId() == static_cast<int>(idx)) {
372  res = e;
373  break;
374  }
375  }
376  return res;
377  }
378 
379  //------------------------------------
380  //! returns the index of the entry with a particular bit ID
381  int getIdOfEntryWithBitId(unsigned int idx) const {
382  URANGE_CHECK(idx, this->getFPLength() - 1);
383  typename boost::property_map<CatalogGraph, vertex_entry_t>::const_type
384  pMap = boost::get(vertex_entry_t(), d_graph);
385  int res = -1;
386  for (unsigned int i = idx; i < this->getNumEntries(); i++) {
387  const entryType *e = pMap[i];
388  if (static_cast<unsigned int>(e->getBitId()) == idx) {
389  res = i;
390  break;
391  }
392  }
393  return res;
394  }
395 
396  //------------------------------------
397  //! returns a list of the indices of entries below the one passed in
398  RDKit::INT_VECT getDownEntryList(unsigned int idx) const {
399  RDKit::INT_VECT res;
400  DOWN_ENT_ITER nbrIdx, endIdx;
401  boost::tie(nbrIdx, endIdx) = boost::adjacent_vertices(idx, d_graph);
402  while (nbrIdx != endIdx) {
403  res.push_back(static_cast<int>(*nbrIdx));
404  nbrIdx++;
405  }
406  // std::cout << res.size() << "\n";
407  return res;
408  }
409 
410  //------------------------------------
411  //! returns a list of the indices that have a particular order
412  const RDKit::INT_VECT &getEntriesOfOrder(orderType ord) {
413  return d_orderMap[ord];
414  }
415 
416  //------------------------------------
417  //! returns a list of the indices that have a particular order
418  /*!
419  \overload
420  */
421  const RDKit::INT_VECT &getEntriesOfOrder(orderType ord) const {
422  typename std::map<orderType, RDKit::INT_VECT>::const_iterator elem;
423  elem = d_orderMap.find(ord);
425  elem != d_orderMap.end(),
426  " catalog does not contain any entries of the order specified");
427  return elem->second;
428  }
429 
430  private:
431  // graphs that store the entries in the catalog in a hierachical manner
432  CatalogGraph d_graph;
433  // a map that maps the order type of entries in the catalog to
434  // a vector of vertex indices in the graphs above
435  // e.g. for a catalog with molecular fragments, the order of a fragment can
436  // simply be the number of bond in it. The list this oder maps to is all the
437  // vertex ids of these fragment in the catalog that have this many bonds in
438  // them
439  std::map<orderType, RDKit::INT_VECT> d_orderMap;
440 
441  //------------------------------------
442  //! clear any memory that we've used
443  void destroy() {
444  ENT_ITER_PAIR entItP = boost::vertices(d_graph);
445  typename boost::property_map<CatalogGraph, vertex_entry_t>::type pMap =
446  boost::get(vertex_entry_t(), d_graph);
447  while (entItP.first != entItP.second) {
448  delete pMap[*(entItP.first++)];
449  }
450  }
451 };
452 
453 //-----------------------------------------------------------------------------
454 //! a linear Catalog (analogous to an std::vector)
455 /*!
456  Here there is no particular hierarchy, simply a
457  collection of entries.
458 */
459 template <class entryType, class orderType>
460 class LinearCatalog : public Catalog<entryType, orderType> {
461  // here there is no particular hierarchy of entries
462  // we simply model it as a vector of entries
463  // FIX: for retrieval purposes a better model map be std::map
464 
465  public:
466  std::string Serialize();
467 
468  unsigned int addEntry(entryType *entry, bool updateFPLength = true);
469 
470  const entryType *getEntryWithIdx(unsigned int idx) const;
471 
472  private:
473  std::vector<entryType *> d_vector;
474 };
475 }
476 
477 #endif
virtual const entryType * getEntryWithIdx(unsigned int idx) const =0
returns a particular entry in the Catalog
const int versionMinor
Definition: Catalog.h:32
void initFromString(const std::string &text)
fills the contents of this object from a string containing a pickle
Definition: Catalog.h:286
CAT_GRAPH_TRAITS::adjacency_iterator DOWN_ENT_ITER
Definition: Catalog.h:156
used by the BGL to set up the node properties in our graph
Definition: Catalog.h:139
paramType paramType_t
Definition: Catalog.h:42
unsigned int getFPLength() const
returns the length of our fingerprint
Definition: Catalog.h:76
int getIdOfEntryWithBitId(unsigned int idx) const
returns the index of the entry with a particular bit ID
Definition: Catalog.h:381
virtual std::string Serialize() const =0
return a serialized form of the Catalog as an std::string
#define CHECK_INVARIANT(expr, mess)
Definition: Invariant.h:99
boost::vertex_property_tag kind
Definition: Catalog.h:141
virtual void setCatalogParams(paramType *params)
sets our parameters by copying the params argument
Definition: Catalog.h:84
virtual unsigned int addEntry(entryType *entry, bool updateFPLength=true)=0
adds an entry to the catalog
void streamRead(std::istream &ss, T &loc)
does a binary read of an object from a stream
Definition: StreamOps.h:226
boost::graph_traits< CatalogGraph > CAT_GRAPH_TRAITS
Definition: Catalog.h:153
unsigned int getNumEntries() const
returns the number of entries
Definition: Catalog.h:282
const int endianId
Definition: Catalog.h:34
virtual unsigned int getNumEntries() const =0
returns the number of entries
std::pair< DOWN_ENT_ITER, DOWN_ENT_ITER > DOWN_ENT_ITER_PAIR
Definition: Catalog.h:157
a linear Catalog (analogous to an std::vector)
Definition: Catalog.h:460
A Catalog with a hierarchical structure.
Definition: Catalog.h:134
std::pair< VER_ITER, VER_ITER > ENT_ITER_PAIR
Definition: Catalog.h:155
void initFromStream(std::istream &ss)
fills the contents of this object from a stream containing a pickle
Definition: Catalog.h:235
const entryType * getEntryWithIdx(unsigned int idx) const
returns a pointer to our entry with a particular index
Definition: Catalog.h:354
abstract base class for a catalog object
Definition: Catalog.h:39
void toStream(std::ostream &ss) const
serializes this object to a stream
Definition: Catalog.h:180
const paramType * getCatalogParams() const
returns a pointer to our parameters
Definition: Catalog.h:100
std::string Serialize() const
serializes this object and returns the resulting pickle
Definition: Catalog.h:226
std::vector< int > INT_VECT
Definition: types.h:188
CAT_GRAPH_TRAITS::vertex_iterator VER_ITER
Definition: Catalog.h:154
virtual ~Catalog()
Definition: Catalog.h:48
const int versionPatch
Definition: Catalog.h:33
boost::property< vertex_entry_t, entryType * > EntryProperty
Definition: Catalog.h:143
const entryType * getEntryWithBitId(unsigned int idx) const
returns a pointer to our entry with a particular bit ID
Definition: Catalog.h:364
#define URANGE_CHECK(x, hi)
Definition: Invariant.h:140
boost::adjacency_list< boost::vecS, boost::vecS, boost::bidirectionalS, EntryProperty > CatalogGraph
the type of the graph itself:
Definition: Catalog.h:151
const int versionMajor
Definition: Catalog.h:31
void streamWrite(std::ostream &ss, const T &val)
does a binary write of an object to a stream
Definition: StreamOps.h:220
#define PRECONDITION(expr, mess)
Definition: Invariant.h:107
unsigned int d_fpLength
the length of our fingerprint
Definition: Catalog.h:108
paramType * dp_cParams
our params object
Definition: Catalog.h:109
const RDKit::INT_VECT & getEntriesOfOrder(orderType ord)
returns a list of the indices that have a particular order
Definition: Catalog.h:412
std::ostream & toStream(std::ostream &)
void addEdge(unsigned int id1, unsigned int id2)
adds an edge between two entries in the catalog
Definition: Catalog.h:335
const RDKit::INT_VECT & getEntriesOfOrder(orderType ord) const
returns a list of the indices that have a particular order
Definition: Catalog.h:421
unsigned int addEntry(entryType *entry, bool updateFPLength=true)
add a new entry to the catalog
Definition: Catalog.h:304
void setFPLength(unsigned int val)
sets our fingerprint length
Definition: Catalog.h:80
void pickle(const boost::shared_ptr< EnumerationStrategyBase > &enumerator, std::ostream &ss)
pickles a EnumerationStrategy and adds the results to a stream ss
RDKit::INT_VECT getDownEntryList(unsigned int idx) const
returns a list of the indices of entries below the one passed in
Definition: Catalog.h:398
entryType entryType_t
Definition: Catalog.h:41