RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
SparseIntVect.h
Go to the documentation of this file.
1// $Id$
2//
3// Copyright (C) 2007-2008 Greg Landrum
4//
5// @@ All Rights Reserved @@
6// This file is part of the RDKit.
7// The contents are covered by the terms of the BSD license
8// which is included in the file license.txt, found at the root
9// of the RDKit source tree.
10//
11#include <RDGeneral/export.h>
12#ifndef __RD_SPARSE_INT_VECT_20070921__
13#define __RD_SPARSE_INT_VECT_20070921__
14
15#include <map>
16#include <string>
17#include <RDGeneral/Invariant.h>
18#include <sstream>
20#include <RDGeneral/StreamOps.h>
21#include <cstdint>
22
24 0x0001; //!< version number to use in pickles
25namespace RDKit {
26//! a class for efficiently storing sparse vectors of ints
27template <typename IndexType>
29 public:
30 typedef std::map<IndexType, int> StorageType;
31
32 SparseIntVect() : d_length(0) {}
33
34 //! initialize with a particular length
35 SparseIntVect(IndexType length) : d_length(length) {}
36
37 //! Copy constructor
39 d_length = other.d_length;
40 d_data.clear();
41 d_data.insert(other.d_data.begin(), other.d_data.end());
42 }
43
44 //! constructor from a pickle
45 SparseIntVect(const std::string &pkl) {
46 initFromText(pkl.c_str(), pkl.size());
47 }
48 //! constructor from a pickle
49 SparseIntVect(const char *pkl, const unsigned int len) {
50 initFromText(pkl, len);
51 }
52
54 if (this == &other) {
55 return *this;
56 }
57 d_length = other.d_length;
58 d_data.clear();
59 d_data.insert(other.d_data.begin(), other.d_data.end());
60 return *this;
61 }
62
63 //! destructor (doesn't need to do anything)
64 ~SparseIntVect() = default;
65
66#ifdef __clang__
67#pragma clang diagnostic push
68#pragma clang diagnostic ignored "-Wtautological-compare"
69#elif (defined(__GNUC__) || defined(__GNUG__)) && \
70 (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 1))
71#if (__GNUC__ > 4 || __GNUC_MINOR__ > 5)
72#pragma GCC diagnostic push
73#endif
74#pragma GCC diagnostic ignored "-Wtype-limits"
75#endif
76 //! return the value at an index
77 int getVal(IndexType idx) const {
78 if (idx < 0 || idx >= d_length) {
79 throw IndexErrorException(static_cast<int>(idx));
80 }
81 int res = 0;
82 typename StorageType::const_iterator iter = d_data.find(idx);
83 if (iter != d_data.end()) {
84 res = iter->second;
85 }
86 return res;
87 }
88
89 //! set the value at an index
90 void setVal(IndexType idx, int val) {
91 if (idx < 0 || idx >= d_length) {
92 throw IndexErrorException(static_cast<int>(idx));
93 }
94 if (val != 0) {
95 d_data[idx] = val;
96 } else {
97 d_data.erase(idx);
98 }
99 }
100#ifdef __clang__
101#pragma clang diagnostic pop
102#elif (defined(__GNUC__) || defined(__GNUG__)) && \
103 (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 5))
104#pragma GCC diagnostic pop
105#endif
106 //! support indexing using []
107 int operator[](IndexType idx) const { return getVal(idx); }
108
109 //! returns the length
110 IndexType getLength() const { return d_length; }
111
112 //! returns the sum of all the elements in the vect
113 //! the doAbs argument toggles summing the absolute values of the elements
114 int getTotalVal(bool doAbs = false) const {
115 int res = 0;
116 typename StorageType::const_iterator iter;
117 for (iter = d_data.begin(); iter != d_data.end(); ++iter) {
118 if (!doAbs) {
119 res += iter->second;
120 } else {
121 res += abs(iter->second);
122 }
123 }
124 return res;
125 }
126 //! returns the length
127 unsigned int size() const { return getLength(); }
128
129 //! returns our nonzero elements as a map(IndexType->int)
130 const StorageType &getNonzeroElements() const { return d_data; }
131
132 //! this is a "fuzzy" intesection, the final value
133 //! of each element is equal to the minimum from
134 //! the two vects.
136 if (other.d_length != d_length) {
137 throw ValueErrorException("SparseIntVect size mismatch");
138 }
139
140 typename StorageType::iterator iter = d_data.begin();
141 typename StorageType::const_iterator oIter = other.d_data.begin();
142 while (iter != d_data.end()) {
143 // we're relying on the fact that the maps are sorted:
144 while (oIter != other.d_data.end() && oIter->first < iter->first) {
145 ++oIter;
146 }
147 if (oIter != other.d_data.end() && oIter->first == iter->first) {
148 // found it:
149 if (oIter->second < iter->second) {
150 iter->second = oIter->second;
151 }
152 ++oIter;
153 ++iter;
154 } else {
155 // not there; our value is zero, which means
156 // we should remove this value:
157 typename StorageType::iterator tmpIter = iter;
158 ++tmpIter;
159 d_data.erase(iter);
160 iter = tmpIter;
161 }
162 }
163 return *this;
164 }
166 const SparseIntVect<IndexType> &other) const {
168 return res &= other;
169 }
170
171 //! this is a "fuzzy" union, the final value
172 //! of each element is equal to the maximum from
173 //! the two vects.
175 if (other.d_length != d_length) {
176 throw ValueErrorException("SparseIntVect size mismatch");
177 }
178
179 typename StorageType::iterator iter = d_data.begin();
180 typename StorageType::const_iterator oIter = other.d_data.begin();
181 while (iter != d_data.end()) {
182 // we're relying on the fact that the maps are sorted:
183 while (oIter != other.d_data.end() && oIter->first < iter->first) {
184 d_data[oIter->first] = oIter->second;
185 ++oIter;
186 }
187 if (oIter != other.d_data.end() && oIter->first == iter->first) {
188 // found it:
189 if (oIter->second > iter->second) {
190 iter->second = oIter->second;
191 }
192 ++oIter;
193 }
194 ++iter;
195 }
196 // finish up the other vect:
197 while (oIter != other.d_data.end()) {
198 d_data[oIter->first] = oIter->second;
199 ++oIter;
200 }
201 return *this;
202 }
204 const SparseIntVect<IndexType> &other) const {
206 return res |= other;
207 }
208
210 if (other.d_length != d_length) {
211 throw ValueErrorException("SparseIntVect size mismatch");
212 }
213 typename StorageType::iterator iter = d_data.begin();
214 typename StorageType::const_iterator oIter = other.d_data.begin();
215 while (oIter != other.d_data.end()) {
216 while (iter != d_data.end() && iter->first < oIter->first) {
217 ++iter;
218 }
219 if (iter != d_data.end() && oIter->first == iter->first) {
220 // found it:
221 iter->second += oIter->second;
222 if (!iter->second) {
223 typename StorageType::iterator tIter = iter;
224 ++tIter;
225 d_data.erase(iter);
226 iter = tIter;
227 } else {
228 ++iter;
229 }
230 } else {
231 d_data[oIter->first] = oIter->second;
232 }
233 ++oIter;
234 }
235 return *this;
236 }
238 const SparseIntVect<IndexType> &other) const {
240 return res += other;
241 }
242
244 if (other.d_length != d_length) {
245 throw ValueErrorException("SparseIntVect size mismatch");
246 }
247 typename StorageType::iterator iter = d_data.begin();
248 typename StorageType::const_iterator oIter = other.d_data.begin();
249 while (oIter != other.d_data.end()) {
250 while (iter != d_data.end() && iter->first < oIter->first) {
251 ++iter;
252 }
253 if (iter != d_data.end() && oIter->first == iter->first) {
254 // found it:
255 iter->second -= oIter->second;
256 if (!iter->second) {
257 typename StorageType::iterator tIter = iter;
258 ++tIter;
259 d_data.erase(iter);
260 iter = tIter;
261 } else {
262 ++iter;
263 }
264 } else {
265 d_data[oIter->first] = -oIter->second;
266 }
267 ++oIter;
268 }
269 return *this;
270 }
272 const SparseIntVect<IndexType> &other) const {
274 return res -= other;
275 }
277 typename StorageType::iterator iter = d_data.begin();
278 while (iter != d_data.end()) {
279 iter->second *= v;
280 ++iter;
281 }
282 return *this;
283 }
286 return res *= v;
287 }
289 typename StorageType::iterator iter = d_data.begin();
290 while (iter != d_data.end()) {
291 iter->second /= v;
292 ++iter;
293 }
294 return *this;
295 }
298 return res /= v;
299 }
301 typename StorageType::iterator iter = d_data.begin();
302 while (iter != d_data.end()) {
303 iter->second += v;
304 ++iter;
305 }
306 return *this;
307 }
310 return res += v;
311 }
313 typename StorageType::iterator iter = d_data.begin();
314 while (iter != d_data.end()) {
315 iter->second -= v;
316 ++iter;
317 }
318 return *this;
319 }
322 return res -= v;
323 }
324
325 bool operator==(const SparseIntVect<IndexType> &v2) const {
326 if (d_length != v2.d_length) {
327 return false;
328 }
329 return d_data == v2.d_data;
330 }
331 bool operator!=(const SparseIntVect<IndexType> &v2) const {
332 return !(*this == v2);
333 }
334
335 //! returns a binary string representation (pickle)
336 std::string toString() const {
337 std::stringstream ss(std::ios_base::binary | std::ios_base::out |
338 std::ios_base::in);
339 std::uint32_t tInt;
342 tInt = sizeof(IndexType);
344 streamWrite(ss, d_length);
345 IndexType nEntries = d_data.size();
347
348 typename StorageType::const_iterator iter = d_data.begin();
349 while (iter != d_data.end()) {
350 streamWrite(ss, iter->first);
351 std::int32_t tInt = iter->second;
353 ++iter;
354 }
355 return ss.str();
356 }
357
358 void fromString(const std::string &txt) {
359 initFromText(txt.c_str(), txt.length());
360 }
361
362 private:
363 IndexType d_length;
364 StorageType d_data;
365
366 void initFromText(const char *pkl, const unsigned int len) {
367 d_data.clear();
368 std::stringstream ss(std::ios_base::binary | std::ios_base::out |
369 std::ios_base::in);
370 ss.write(pkl, len);
371
372 std::uint32_t vers;
374 if (vers == 0x0001) {
375 std::uint32_t tInt;
377 if (tInt > sizeof(IndexType)) {
379 "IndexType cannot accommodate index size in SparseIntVect pickle");
380 }
381 switch (tInt) {
382 case sizeof(char):
383 readVals<unsigned char>(ss);
384 break;
385 case sizeof(std::int32_t):
386 readVals<std::uint32_t>(ss);
387 break;
388 case sizeof(boost::int64_t):
389 readVals<boost::uint64_t>(ss);
390 break;
391 default:
392 throw ValueErrorException("unreadable format");
393 }
394 } else {
395 throw ValueErrorException("bad version in SparseIntVect pickle");
396 }
397 }
398 template <typename T>
399 void readVals(std::stringstream &ss) {
400 PRECONDITION(sizeof(T) <= sizeof(IndexType), "invalid size");
401 T tVal;
403 d_length = tVal;
404 T nEntries;
406 for (T i = 0; i < nEntries; ++i) {
408 std::int32_t val;
409 streamRead(ss, val);
410 d_data[tVal] = val;
411 }
412 }
413};
414
415template <typename IndexType, typename SequenceType>
417 const SequenceType &seq) {
418 typename SequenceType::const_iterator seqIt;
419 for (seqIt = seq.begin(); seqIt != seq.end(); ++seqIt) {
420 // EFF: probably not the most efficient approach
421 IndexType idx = *seqIt;
422 vect.setVal(idx, vect.getVal(idx) + 1);
423 }
424}
425
426namespace {
427template <typename IndexType>
429 const SparseIntVect<IndexType> &v2, double &v1Sum,
430 double &v2Sum, double &andSum) {
431 if (v1.getLength() != v2.getLength()) {
432 throw ValueErrorException("SparseIntVect size mismatch");
433 }
434 v1Sum = v2Sum = andSum = 0.0;
435 // we're doing : (v1&v2).getTotalVal(), but w/o generating
436 // the other vector:
437 typename SparseIntVect<IndexType>::StorageType::const_iterator iter1, iter2;
438 iter1 = v1.getNonzeroElements().begin();
439 if (iter1 != v1.getNonzeroElements().end()) {
440 v1Sum += abs(iter1->second);
441 }
442 iter2 = v2.getNonzeroElements().begin();
443 if (iter2 != v2.getNonzeroElements().end()) {
444 v2Sum += abs(iter2->second);
445 }
446 while (iter1 != v1.getNonzeroElements().end()) {
447 while (iter2 != v2.getNonzeroElements().end() &&
448 iter2->first < iter1->first) {
449 ++iter2;
450 if (iter2 != v2.getNonzeroElements().end()) {
451 v2Sum += abs(iter2->second);
452 }
453 }
454 if (iter2 != v2.getNonzeroElements().end()) {
455 if (iter2->first == iter1->first) {
456 if (abs(iter2->second) < abs(iter1->second)) {
457 andSum += abs(iter2->second);
458 } else {
459 andSum += abs(iter1->second);
460 }
461 ++iter2;
462 if (iter2 != v2.getNonzeroElements().end()) {
463 v2Sum += abs(iter2->second);
464 }
465 }
466 ++iter1;
467 if (iter1 != v1.getNonzeroElements().end()) {
468 v1Sum += abs(iter1->second);
469 }
470 } else {
471 break;
472 }
473 }
474 if (iter1 != v1.getNonzeroElements().end()) {
475 ++iter1;
476 while (iter1 != v1.getNonzeroElements().end()) {
477 v1Sum += abs(iter1->second);
478 ++iter1;
479 }
480 }
481 if (iter2 != v2.getNonzeroElements().end()) {
482 ++iter2;
483 while (iter2 != v2.getNonzeroElements().end()) {
484 v2Sum += abs(iter2->second);
485 ++iter2;
486 }
487 }
488}
489} // namespace
490
491template <typename IndexType>
493 const SparseIntVect<IndexType> &v2,
494 bool returnDistance = false, double bounds = 0.0) {
495 if (v1.getLength() != v2.getLength()) {
496 throw ValueErrorException("SparseIntVect size mismatch");
497 }
498 double v1Sum = 0.0;
499 double v2Sum = 0.0;
500 if (!returnDistance && bounds > 0.0) {
501 v1Sum = v1.getTotalVal(true);
502 v2Sum = v2.getTotalVal(true);
503 double denom = v1Sum + v2Sum;
504 if (fabs(denom) < 1e-6) {
505 // no need to worry about returnDistance here
506 return 0.0;
507 }
508 double minV = v1Sum < v2Sum ? v1Sum : v2Sum;
509 if (2. * minV / denom < bounds) {
510 return 0.0;
511 }
512 v1Sum = 0.0;
513 v2Sum = 0.0;
514 }
515
516 double numer = 0.0;
517
518 calcVectParams(v1, v2, v1Sum, v2Sum, numer);
519
520 double denom = v1Sum + v2Sum;
521 double sim;
522 if (fabs(denom) < 1e-6) {
523 sim = 0.0;
524 } else {
525 sim = 2. * numer / denom;
526 }
527 if (returnDistance) {
528 sim = 1. - sim;
529 }
530 // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
531 return sim;
532}
533
534template <typename IndexType>
536 const SparseIntVect<IndexType> &v2, double a, double b,
537 bool returnDistance = false, double bounds = 0.0) {
539 if (v1.getLength() != v2.getLength()) {
540 throw ValueErrorException("SparseIntVect size mismatch");
541 }
542 double v1Sum = 0.0;
543 double v2Sum = 0.0;
544 double andSum = 0.0;
545
546 calcVectParams(v1, v2, v1Sum, v2Sum, andSum);
547
548 double denom = a * v1Sum + b * v2Sum + (1 - a - b) * andSum;
549 double sim;
550
551 if (fabs(denom) < 1e-6) {
552 sim = 0.0;
553 } else {
554 sim = andSum / denom;
555 }
556 if (returnDistance) {
557 sim = 1. - sim;
558 }
559 // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
560 return sim;
561}
562
563template <typename IndexType>
565 const SparseIntVect<IndexType> &v2,
566 bool returnDistance = false, double bounds = 0.0) {
567 return TverskySimilarity(v1, v2, 1.0, 1.0, returnDistance, bounds);
568}
569} // namespace RDKit
570
571#endif
#define RDUNUSED_PARAM(x)
Definition Invariant.h:196
#define PRECONDITION(expr, mess)
Definition Invariant.h:109
const int ci_SPARSEINTVECT_VERSION
version number to use in pickles
Class to allow us to throw an IndexError from C++ and have it make it back to Python.
Definition Exceptions.h:20
a class for efficiently storing sparse vectors of ints
const StorageType & getNonzeroElements() const
returns our nonzero elements as a map(IndexType->int)
SparseIntVect< IndexType > & operator&=(const SparseIntVect< IndexType > &other)
SparseIntVect(IndexType length)
initialize with a particular length
SparseIntVect< IndexType > & operator-=(int v)
unsigned int size() const
returns the length
SparseIntVect< IndexType > & operator+(int v)
SparseIntVect< IndexType > & operator-(int v)
SparseIntVect< IndexType > & operator/(int v)
bool operator==(const SparseIntVect< IndexType > &v2) const
SparseIntVect(const SparseIntVect< IndexType > &other)
Copy constructor.
~SparseIntVect()=default
destructor (doesn't need to do anything)
const SparseIntVect< IndexType > operator&(const SparseIntVect< IndexType > &other) const
SparseIntVect< IndexType > & operator+=(int v)
SparseIntVect(const char *pkl, const unsigned int len)
constructor from a pickle
int operator[](IndexType idx) const
support indexing using []
void fromString(const std::string &txt)
void setVal(IndexType idx, int val)
set the value at an index
SparseIntVect< IndexType > & operator|=(const SparseIntVect< IndexType > &other)
std::string toString() const
returns a binary string representation (pickle)
int getTotalVal(bool doAbs=false) const
SparseIntVect< IndexType > & operator-=(const SparseIntVect< IndexType > &other)
const SparseIntVect< IndexType > operator+(const SparseIntVect< IndexType > &other) const
std::map< IndexType, int > StorageType
SparseIntVect & operator=(const SparseIntVect< IndexType > &other)
bool operator!=(const SparseIntVect< IndexType > &v2) const
SparseIntVect< IndexType > & operator*=(int v)
SparseIntVect< IndexType > & operator+=(const SparseIntVect< IndexType > &other)
SparseIntVect< IndexType > & operator*(int v)
SparseIntVect(const std::string &pkl)
constructor from a pickle
SparseIntVect< IndexType > & operator/=(int v)
const SparseIntVect< IndexType > operator-(const SparseIntVect< IndexType > &other) const
IndexType getLength() const
returns the length
int getVal(IndexType idx) const
return the value at an index
const SparseIntVect< IndexType > operator|(const SparseIntVect< IndexType > &other) const
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition Exceptions.h:40
Std stuff.
bool rdvalue_is(const RDValue_cast_t)
double TverskySimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, double a, double b, bool returnDistance=false, double bounds=0.0)
void updateFromSequence(SparseIntVect< IndexType > &vect, const SequenceType &seq)
double TanimotoSimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, bool returnDistance=false, double bounds=0.0)
double DiceSimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, bool returnDistance=false, double bounds=0.0)
void streamRead(std::istream &ss, T &loc)
does a binary read of an object from a stream
Definition StreamOps.h:283
void streamWrite(std::ostream &ss, const T &val)
does a binary write of an object to a stream
Definition StreamOps.h:261
Definition RDLog.h:25