RDKit
Open-source cheminformatics and machine learning.
SparseIntVect.h
Go to the documentation of this file.
1 // $Id$
2 //
3 // Copyright (C) 2007-2008 Greg Landrum
4 //
5 // @@ All Rights Reserved @@
6 // This file is part of the RDKit.
7 // The contents are covered by the terms of the BSD license
8 // which is included in the file license.txt, found at the root
9 // of the RDKit source tree.
10 //
11 #ifndef __RD_SPARSE_INT_VECT_20070921__
12 #define __RD_SPARSE_INT_VECT_20070921__
13 
14 #include <map>
15 #include <string>
16 #include <RDGeneral/Invariant.h>
17 #include <sstream>
18 #include <RDGeneral/Exceptions.h>
19 #include <RDGeneral/StreamOps.h>
20 #include <boost/cstdint.hpp>
21 
23  0x0001; //!< version number to use in pickles
24 namespace RDKit {
25 //! a class for efficiently storing sparse vectors of ints
26 template <typename IndexType>
28  public:
29  typedef std::map<IndexType, int> StorageType;
30 
31  SparseIntVect() : d_length(0){};
32 
33  //! initialize with a particular length
34  SparseIntVect(IndexType length) : d_length(length){};
35 
36  //! Copy constructor
38  d_length = other.d_length;
39  d_data.insert(other.d_data.begin(), other.d_data.end());
40  }
41 
42  //! constructor from a pickle
43  SparseIntVect(const std::string &pkl) {
44  initFromText(pkl.c_str(), pkl.size());
45  };
46  //! constructor from a pickle
47  SparseIntVect(const char *pkl, const unsigned int len) {
48  initFromText(pkl, len);
49  };
50 
51  //! destructor (doesn't need to do anything)
53 
54 #ifdef __clang__
55 #pragma clang diagnostic push
56 #pragma clang diagnostic ignored "-Wtautological-compare"
57 #elif (defined(__GNUC__) || defined(__GNUG__)) && \
58  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 1))
59 #if (__GNUC__ > 4 || __GNUC_MINOR__ > 5)
60 #pragma GCC diagnostic push
61 #endif
62 #pragma GCC diagnostic ignored "-Wtype-limits"
63 #endif
64  //! return the value at an index
65  int getVal(IndexType idx) const {
66  if (idx < 0 || idx >= d_length) {
67  throw IndexErrorException(static_cast<int>(idx));
68  }
69  int res = 0;
70  typename StorageType::const_iterator iter = d_data.find(idx);
71  if (iter != d_data.end()) {
72  res = iter->second;
73  }
74  return res;
75  };
76 
77  //! set the value at an index
78  void setVal(IndexType idx, int val) {
79  if (idx < 0 || idx >= d_length) {
80  throw IndexErrorException(static_cast<int>(idx));
81  }
82  if (val != 0) {
83  d_data[idx] = val;
84  } else {
85  d_data.erase(idx);
86  }
87  };
88 #ifdef __clang__
89 #pragma clang diagnostic pop
90 #elif (defined(__GNUC__) || defined(__GNUG__)) && \
91  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 5))
92 #pragma GCC diagnostic pop
93 #endif
94  //! support indexing using []
95  int operator[](IndexType idx) const { return getVal(idx); };
96 
97  //! returns the length
98  IndexType getLength() const { return d_length; };
99 
100  //! returns the sum of all the elements in the vect
101  //! the doAbs argument toggles summing the absolute values of the elements
102  int getTotalVal(bool doAbs = false) const {
103  int res = 0;
104  typename StorageType::const_iterator iter;
105  for (iter = d_data.begin(); iter != d_data.end(); ++iter) {
106  if (!doAbs)
107  res += iter->second;
108  else
109  res += abs(iter->second);
110  }
111  return res;
112  };
113  //! returns the length
114  unsigned int size() const { return getLength(); };
115 
116  //! returns our nonzero elements as a map(IndexType->int)
117  const StorageType &getNonzeroElements() const { return d_data; }
118 
119  //! this is a "fuzzy" intesection, the final value
120  //! of each element is equal to the minimum from
121  //! the two vects.
123  if (other.d_length != d_length) {
124  throw ValueErrorException("SparseIntVect size mismatch");
125  }
126 
127  typename StorageType::iterator iter = d_data.begin();
128  typename StorageType::const_iterator oIter = other.d_data.begin();
129  while (iter != d_data.end()) {
130  // we're relying on the fact that the maps are sorted:
131  while (oIter != other.d_data.end() && oIter->first < iter->first) {
132  ++oIter;
133  }
134  if (oIter != other.d_data.end() && oIter->first == iter->first) {
135  // found it:
136  if (oIter->second < iter->second) {
137  iter->second = oIter->second;
138  }
139  ++oIter;
140  ++iter;
141  } else {
142  // not there; our value is zero, which means
143  // we should remove this value:
144  typename StorageType::iterator tmpIter = iter;
145  ++tmpIter;
146  d_data.erase(iter);
147  iter = tmpIter;
148  }
149  }
150  return *this;
151  };
153  const SparseIntVect<IndexType> &other) const {
154  SparseIntVect<IndexType> res(*this);
155  return res &= other;
156  }
157 
158  //! this is a "fuzzy" union, the final value
159  //! of each element is equal to the maximum from
160  //! the two vects.
162  if (other.d_length != d_length) {
163  throw ValueErrorException("SparseIntVect size mismatch");
164  }
165 
166  typename StorageType::iterator iter = d_data.begin();
167  typename StorageType::const_iterator oIter = other.d_data.begin();
168  while (iter != d_data.end()) {
169  // we're relying on the fact that the maps are sorted:
170  while (oIter != other.d_data.end() && oIter->first < iter->first) {
171  d_data[oIter->first] = oIter->second;
172  ++oIter;
173  }
174  if (oIter != other.d_data.end() && oIter->first == iter->first) {
175  // found it:
176  if (oIter->second > iter->second) {
177  iter->second = oIter->second;
178  }
179  ++oIter;
180  }
181  ++iter;
182  }
183  // finish up the other vect:
184  while (oIter != other.d_data.end()) {
185  d_data[oIter->first] = oIter->second;
186  ++oIter;
187  }
188  return *this;
189  };
191  const SparseIntVect<IndexType> &other) const {
192  SparseIntVect<IndexType> res(*this);
193  return res |= other;
194  }
195 
197  if (other.d_length != d_length) {
198  throw ValueErrorException("SparseIntVect size mismatch");
199  }
200  typename StorageType::iterator iter = d_data.begin();
201  typename StorageType::const_iterator oIter = other.d_data.begin();
202  while (oIter != other.d_data.end()) {
203  while (iter != d_data.end() && iter->first < oIter->first) {
204  ++iter;
205  }
206  if (iter != d_data.end() && oIter->first == iter->first) {
207  // found it:
208  iter->second += oIter->second;
209  if (!iter->second) {
210  typename StorageType::iterator tIter = iter;
211  ++tIter;
212  d_data.erase(iter);
213  iter = tIter;
214  } else {
215  ++iter;
216  }
217  } else {
218  d_data[oIter->first] = oIter->second;
219  }
220  ++oIter;
221  }
222  return *this;
223  };
225  const SparseIntVect<IndexType> &other) const {
226  SparseIntVect<IndexType> res(*this);
227  return res += other;
228  }
229 
231  if (other.d_length != d_length) {
232  throw ValueErrorException("SparseIntVect size mismatch");
233  }
234  typename StorageType::iterator iter = d_data.begin();
235  typename StorageType::const_iterator oIter = other.d_data.begin();
236  while (oIter != other.d_data.end()) {
237  while (iter != d_data.end() && iter->first < oIter->first) {
238  ++iter;
239  }
240  if (iter != d_data.end() && oIter->first == iter->first) {
241  // found it:
242  iter->second -= oIter->second;
243  if (!iter->second) {
244  typename StorageType::iterator tIter = iter;
245  ++tIter;
246  d_data.erase(iter);
247  iter = tIter;
248  } else {
249  ++iter;
250  }
251  } else {
252  d_data[oIter->first] = -oIter->second;
253  }
254  ++oIter;
255  }
256  return *this;
257  };
259  const SparseIntVect<IndexType> &other) const {
260  SparseIntVect<IndexType> res(*this);
261  return res -= other;
262  }
264  typename StorageType::iterator iter = d_data.begin();
265  while (iter != d_data.end()) {
266  iter->second *= v;
267  ++iter;
268  }
269  return *this;
270  };
272  SparseIntVect<IndexType> res(*this);
273  return res *= v;
274  };
276  typename StorageType::iterator iter = d_data.begin();
277  while (iter != d_data.end()) {
278  iter->second /= v;
279  ++iter;
280  }
281  return *this;
282  };
284  SparseIntVect<IndexType> res(*this);
285  return res /= v;
286  };
288  typename StorageType::iterator iter = d_data.begin();
289  while (iter != d_data.end()) {
290  iter->second += v;
291  ++iter;
292  }
293  return *this;
294  };
296  SparseIntVect<IndexType> res(*this);
297  return res += v;
298  };
300  typename StorageType::iterator iter = d_data.begin();
301  while (iter != d_data.end()) {
302  iter->second -= v;
303  ++iter;
304  }
305  return *this;
306  };
308  SparseIntVect<IndexType> res(*this);
309  return res -= v;
310  };
311 
312  bool operator==(const SparseIntVect<IndexType> &v2) const {
313  if (d_length != v2.d_length) {
314  return false;
315  }
316  return d_data == v2.d_data;
317  }
318  bool operator!=(const SparseIntVect<IndexType> &v2) const {
319  return !(*this == v2);
320  }
321 
322  //! returns a binary string representation (pickle)
323  std::string toString() const {
324  std::stringstream ss(std::ios_base::binary | std::ios_base::out |
325  std::ios_base::in);
326  boost::uint32_t tInt;
328  streamWrite(ss, tInt);
329  tInt = sizeof(IndexType);
330  streamWrite(ss, tInt);
331  streamWrite(ss, d_length);
332  IndexType nEntries = d_data.size();
333  streamWrite(ss, nEntries);
334 
335  typename StorageType::const_iterator iter = d_data.begin();
336  while (iter != d_data.end()) {
337  streamWrite(ss, iter->first);
338  boost::int32_t tInt = iter->second;
339  streamWrite(ss, tInt);
340  ++iter;
341  }
342  return ss.str();
343  };
344 
345  void fromString(const std::string &txt) {
346  initFromText(txt.c_str(), txt.length());
347  }
348 
349  private:
350  IndexType d_length;
351  StorageType d_data;
352 
353  void initFromText(const char *pkl, const unsigned int len) {
354  d_data.clear();
355  std::stringstream ss(std::ios_base::binary | std::ios_base::out |
356  std::ios_base::in);
357  ss.write(pkl, len);
358 
359  boost::uint32_t vers;
360  streamRead(ss, vers);
361  if (vers == 0x0001) {
362  boost::uint32_t tInt;
363  streamRead(ss, tInt);
364  if (tInt > sizeof(IndexType)) {
365  throw ValueErrorException(
366  "IndexType cannot accomodate index size in SparseIntVect pickle");
367  }
368  switch (tInt) {
369  case sizeof(char):
370  readVals<unsigned char>(ss);
371  break;
372  case sizeof(boost::int32_t):
373  readVals<boost::uint32_t>(ss);
374  break;
375  case sizeof(boost::int64_t):
376  readVals<boost::uint64_t>(ss);
377  break;
378  default:
379  throw ValueErrorException("unreadable format");
380  }
381  } else {
382  throw ValueErrorException("bad version in SparseIntVect pickle");
383  }
384  };
385  template <typename T>
386  void readVals(std::stringstream &ss) {
387  PRECONDITION(sizeof(T) <= sizeof(IndexType), "invalid size");
388  T tVal;
389  streamRead(ss, tVal);
390  d_length = tVal;
391  T nEntries;
392  streamRead(ss, nEntries);
393  for (T i = 0; i < nEntries; ++i) {
394  streamRead(ss, tVal);
395  boost::int32_t val;
396  streamRead(ss, val);
397  d_data[tVal] = val;
398  }
399  }
400 };
401 
402 template <typename IndexType, typename SequenceType>
404  const SequenceType &seq) {
405  typename SequenceType::const_iterator seqIt;
406  for (seqIt = seq.begin(); seqIt != seq.end(); ++seqIt) {
407  // EFF: probably not the most efficient approach
408  IndexType idx = *seqIt;
409  vect.setVal(idx, vect.getVal(idx) + 1);
410  }
411 }
412 
413 namespace {
414 template <typename IndexType>
415 void calcVectParams(const SparseIntVect<IndexType> &v1,
416  const SparseIntVect<IndexType> &v2, double &v1Sum,
417  double &v2Sum, double &andSum) {
418  if (v1.getLength() != v2.getLength()) {
419  throw ValueErrorException("SparseIntVect size mismatch");
420  }
421  v1Sum = v2Sum = andSum = 0.0;
422  // we're doing : (v1&v2).getTotalVal(), but w/o generating
423  // the other vector:
425  iter1 = v1.getNonzeroElements().begin();
426  if (iter1 != v1.getNonzeroElements().end()) v1Sum += abs(iter1->second);
427  iter2 = v2.getNonzeroElements().begin();
428  if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
429  while (iter1 != v1.getNonzeroElements().end()) {
430  while (iter2 != v2.getNonzeroElements().end() &&
431  iter2->first < iter1->first) {
432  ++iter2;
433  if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
434  }
435  if (iter2 != v2.getNonzeroElements().end()) {
436  if (iter2->first == iter1->first) {
437  if (abs(iter2->second) < abs(iter1->second)) {
438  andSum += abs(iter2->second);
439  } else {
440  andSum += abs(iter1->second);
441  }
442  ++iter2;
443  if (iter2 != v2.getNonzeroElements().end()) v2Sum += abs(iter2->second);
444  }
445  ++iter1;
446  if (iter1 != v1.getNonzeroElements().end()) v1Sum += abs(iter1->second);
447  } else {
448  break;
449  }
450  }
451  if (iter1 != v1.getNonzeroElements().end()) {
452  ++iter1;
453  while (iter1 != v1.getNonzeroElements().end()) {
454  v1Sum += abs(iter1->second);
455  ++iter1;
456  }
457  }
458  if (iter2 != v2.getNonzeroElements().end()) {
459  ++iter2;
460  while (iter2 != v2.getNonzeroElements().end()) {
461  v2Sum += abs(iter2->second);
462  ++iter2;
463  }
464  }
465 }
466 }
467 
468 template <typename IndexType>
470  const SparseIntVect<IndexType> &v2,
471  bool returnDistance = false, double bounds = 0.0) {
472  if (v1.getLength() != v2.getLength()) {
473  throw ValueErrorException("SparseIntVect size mismatch");
474  }
475  double v1Sum = 0.0;
476  double v2Sum = 0.0;
477  if (!returnDistance && bounds > 0.0) {
478  v1Sum = v1.getTotalVal(true);
479  v2Sum = v2.getTotalVal(true);
480  double denom = v1Sum + v2Sum;
481  if (fabs(denom) < 1e-6) {
482  if (returnDistance) {
483  return 1.0;
484  } else {
485  return 0.0;
486  }
487  }
488  double minV = v1Sum < v2Sum ? v1Sum : v2Sum;
489  if (2. * minV / denom < bounds) {
490  return 0.0;
491  }
492  v1Sum = 0.0;
493  v2Sum = 0.0;
494  }
495 
496  double numer = 0.0;
497 
498  calcVectParams(v1, v2, v1Sum, v2Sum, numer);
499 
500  double denom = v1Sum + v2Sum;
501  double sim;
502  if (fabs(denom) < 1e-6) {
503  sim = 0.0;
504  } else {
505  sim = 2. * numer / denom;
506  }
507  if (returnDistance) sim = 1. - sim;
508  // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
509  return sim;
510 }
511 
512 template <typename IndexType>
514  const SparseIntVect<IndexType> &v2, double a, double b,
515  bool returnDistance = false, double bounds = 0.0) {
516  RDUNUSED_PARAM(bounds);
517  if (v1.getLength() != v2.getLength()) {
518  throw ValueErrorException("SparseIntVect size mismatch");
519  }
520  double v1Sum = 0.0;
521  double v2Sum = 0.0;
522  double andSum = 0.0;
523 
524  calcVectParams(v1, v2, v1Sum, v2Sum, andSum);
525 
526  double denom = a * v1Sum + b * v2Sum + (1 - a - b) * andSum;
527  double sim;
528 
529  if (fabs(denom) < 1e-6) {
530  sim = 0.0;
531  } else {
532  sim = andSum / denom;
533  }
534  if (returnDistance) sim = 1. - sim;
535  // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
536  return sim;
537 }
538 
539 template <typename IndexType>
541  const SparseIntVect<IndexType> &v2,
542  bool returnDistance = false, double bounds = 0.0) {
543  return TverskySimilarity(v1, v2, 1.0, 1.0, returnDistance, bounds);
544 }
545 }
546 
547 #endif
std::string toString() const
returns a binary string representation (pickle)
double DiceSimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, bool returnDistance=false, double bounds=0.0)
SparseIntVect(const char *pkl, const unsigned int len)
constructor from a pickle
Definition: SparseIntVect.h:47
void updateFromSequence(SparseIntVect< IndexType > &vect, const SequenceType &seq)
const SparseIntVect< IndexType > operator+(const SparseIntVect< IndexType > &other) const
const int ci_SPARSEINTVECT_VERSION
version number to use in pickles
Definition: SparseIntVect.h:22
void streamRead(std::istream &ss, T &loc)
does a binary read of an object from a stream
Definition: StreamOps.h:236
SparseIntVect< IndexType > & operator&=(const SparseIntVect< IndexType > &other)
~SparseIntVect()
destructor (doesn&#39;t need to do anything)
Definition: SparseIntVect.h:52
int getVal(IndexType idx) const
return the value at an index
Definition: SparseIntVect.h:65
std::map< IndexType, int > StorageType
Definition: SparseIntVect.h:29
const SparseIntVect< IndexType > operator&(const SparseIntVect< IndexType > &other) const
SparseIntVect< IndexType > & operator*(int v)
SparseIntVect(const SparseIntVect< IndexType > &other)
Copy constructor.
Definition: SparseIntVect.h:37
unsigned int size() const
returns the length
double TanimotoSimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, bool returnDistance=false, double bounds=0.0)
const SparseIntVect< IndexType > operator-(const SparseIntVect< IndexType > &other) const
bool operator!=(const SparseIntVect< IndexType > &v2) const
SparseIntVect< IndexType > & operator/=(int v)
SparseIntVect< IndexType > & operator/(int v)
Includes a bunch of functionality for handling Atom and Bond queries.
Definition: Atom.h:29
bool operator==(const SparseIntVect< IndexType > &v2) const
SparseIntVect< IndexType > & operator-=(const SparseIntVect< IndexType > &other)
Class to allow us to throw an IndexError from C++ and have it make it back to Python.
Definition: Exceptions.h:18
const StorageType & getNonzeroElements() const
returns our nonzero elements as a map(IndexType->int)
#define RDUNUSED_PARAM(x)
Definition: Invariant.h:194
SparseIntVect(IndexType length)
initialize with a particular length
Definition: SparseIntVect.h:34
SparseIntVect< IndexType > & operator*=(int v)
const SparseIntVect< IndexType > operator|(const SparseIntVect< IndexType > &other) const
double TverskySimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, double a, double b, bool returnDistance=false, double bounds=0.0)
int getTotalVal(bool doAbs=false) const
void streamWrite(std::ostream &ss, const T &val)
does a binary write of an object to a stream
Definition: StreamOps.h:222
#define PRECONDITION(expr, mess)
Definition: Invariant.h:107
SparseIntVect(const std::string &pkl)
constructor from a pickle
Definition: SparseIntVect.h:43
a class for efficiently storing sparse vectors of ints
Definition: SparseIntVect.h:27
void setVal(IndexType idx, int val)
set the value at an index
Definition: SparseIntVect.h:78
SparseIntVect< IndexType > & operator+=(const SparseIntVect< IndexType > &other)
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition: Exceptions.h:32
SparseIntVect< IndexType > & operator-=(int v)
SparseIntVect< IndexType > & operator|=(const SparseIntVect< IndexType > &other)
IndexType getLength() const
returns the length
Definition: SparseIntVect.h:98
SparseIntVect< IndexType > & operator+=(int v)
SparseIntVect< IndexType > & operator+(int v)
int operator[](IndexType idx) const
support indexing using []
Definition: SparseIntVect.h:95
void fromString(const std::string &txt)
SparseIntVect< IndexType > & operator-(int v)