RDKit
Open-source cheminformatics and machine learning.
BitOps.h
Go to the documentation of this file.
1 //
2 // Copyright (C) 2003-2012 greg Landrum and Rational Discovery LLC
3 //
4 // @@ All Rights Reserved @@
5 // This file is part of the RDKit.
6 // The contents are covered by the terms of the BSD license
7 // which is included in the file license.txt, found at the root
8 // of the RDKit source tree.
9 //
10 #ifndef __RD_BITOPS_H__
11 #define __RD_BITOPS_H__
12 /*! \file BitOps.h
13 
14  \brief Contains general bit-comparison and similarity operations.
15 
16  The notation used to document the similarity metrics is:
17  - \c V1_n: number of bits in vector 1
18  - \c V1_o: number of on bits in vector 1
19  - <tt>(V1&V2)_o</tt>: number of on bits in the intersection of vectors 1 and
20  2
21 
22  */
23 
24 #include "BitVects.h"
25 #include <string>
26 
//! general purpose wrapper for calculating the similarity between two bvs
//! that may be of unequal size (will automatically fold as appropriate)
/*!
  \param bv1            the first bit vector
  \param bv2            the second bit vector
  \param metric         the similarity function applied to the two vectors
                        once the longer one has been folded
  \param returnDistance if set, <tt>1 - similarity</tt> is returned instead
                        of the similarity itself

  \return the value produced by \c metric, or one minus that value when
          \c returnDistance is set

  The longer vector is folded by the integer quotient of the two lengths.
  If the shorter vector has no bits at all that quotient is undefined, so no
  folding is attempted and both vectors are handed to \c metric unchanged.
*/
template <typename T>
double SimilarityWrapper(const T& bv1, const T& bv2,
                         double (*metric)(const T&, const T&),
                         bool returnDistance = false) {
  // Owns the temporary produced by FoldFingerprint() so that it is released
  // on every exit path, including an exception thrown by metric.
  struct FoldedGuard {
    T* folded;
    FoldedGuard() : folded(0) {}
    ~FoldedGuard() { delete folded; }

   private:
    // not copyable: a copy would delete the same vector twice
    FoldedGuard(const FoldedGuard&);
    FoldedGuard& operator=(const FoldedGuard&);
  } guard;

  const T* lhs = &bv1;
  const T* rhs = &bv2;
  if (bv1.getNumBits() > bv2.getNumBits()) {
    // bv2 is the shorter vector; a zero length would turn the fold factor
    // into a division by zero.
    if (bv2.getNumBits() != 0) {
      guard.folded = FoldFingerprint(bv1, bv1.getNumBits() / bv2.getNumBits());
      lhs = guard.folded;
    }
  } else if (bv2.getNumBits() > bv1.getNumBits()) {
    // same reasoning with the roles of the two vectors swapped
    if (bv1.getNumBits() != 0) {
      guard.folded = FoldFingerprint(bv2, bv2.getNumBits() / bv1.getNumBits());
      rhs = guard.folded;
    }
  }

  double res = metric(*lhs, *rhs);
  if (returnDistance) res = 1.0 - res;
  return res;
}
//! \overload
/*!
  Variant for metrics that take two additional weighting parameters.

  \param bv1            the first bit vector
  \param bv2            the second bit vector
  \param a              forwarded unchanged to \c metric as its third argument
  \param b              forwarded unchanged to \c metric as its fourth argument
  \param metric         the similarity function applied to the two vectors
                        once the longer one has been folded
  \param returnDistance if set, <tt>1 - similarity</tt> is returned instead
                        of the similarity itself

  \return the value produced by \c metric, or one minus that value when
          \c returnDistance is set

  The longer vector is folded by the integer quotient of the two lengths.
  If the shorter vector has no bits at all that quotient is undefined, so no
  folding is attempted and both vectors are handed to \c metric unchanged.
*/
template <typename T>
double SimilarityWrapper(const T& bv1, const T& bv2, double a, double b,
                         double (*metric)(const T&, const T&, double, double),
                         bool returnDistance = false) {
  // Owns the temporary produced by FoldFingerprint() so that it is released
  // on every exit path, including an exception thrown by metric.
  struct FoldedGuard {
    T* folded;
    FoldedGuard() : folded(0) {}
    ~FoldedGuard() { delete folded; }

   private:
    // not copyable: a copy would delete the same vector twice
    FoldedGuard(const FoldedGuard&);
    FoldedGuard& operator=(const FoldedGuard&);
  } guard;

  const T* lhs = &bv1;
  const T* rhs = &bv2;
  if (bv1.getNumBits() > bv2.getNumBits()) {
    // bv2 is the shorter vector; a zero length would turn the fold factor
    // into a division by zero.
    if (bv2.getNumBits() != 0) {
      guard.folded = FoldFingerprint(bv1, bv1.getNumBits() / bv2.getNumBits());
      lhs = guard.folded;
    }
  } else if (bv2.getNumBits() > bv1.getNumBits()) {
    // same reasoning with the roles of the two vectors swapped
    if (bv1.getNumBits() != 0) {
      guard.folded = FoldFingerprint(bv2, bv2.getNumBits() / bv1.getNumBits());
      rhs = guard.folded;
    }
  }

  double res = metric(*lhs, *rhs, a, b);
  if (returnDistance) res = 1.0 - res;
  return res;
}
68 
69 bool AllProbeBitsMatch(const char* probe, const char* ref);
70 bool AllProbeBitsMatch(const std::string& probe, const std::string& ref);
71 bool AllProbeBitsMatch(const ExplicitBitVect& probe,
72  const ExplicitBitVect& ref);
73 
74 template <typename T1>
75 bool AllProbeBitsMatch(const T1& probe, const std::string& pkl);
76 
77 template <typename T1>
78 bool AllProbeBitsMatch(const T1& probe, const T1& ref);
79 
80 //! returns the number of on bits in common between two bit vectors
81 /*!
82  \return (bv1&bv2)_o
83 */
84 template <typename T1, typename T2>
85 int NumOnBitsInCommon(const T1& bv1, const T2& bv2);
86 
87 int NumOnBitsInCommon(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
88 
89 //! returns the Tanimoto similarity between two bit vects
90 /*!
91  \return <tt>(bv1&bv2)_o / [bv1_o + bv2_o - (bv1&bv2)_o]</tt>
92 */
93 template <typename T1, typename T2>
94 double TanimotoSimilarity(const T1& bv1, const T2& bv2);
95 
96 //! returns the Cosine similarity between two bit vects
97 /*!
98  \return <tt>(bv1&bv2)_o / sqrt(bv1_o * bv2_o)</tt>
99 */
100 template <typename T1, typename T2>
101 double CosineSimilarity(const T1& bv1, const T2& bv2);
102 
103 //! returns the Kulczynski similarity between two bit vects
104 /*!
105  \return <tt>(bv1&bv2)_o * [bv1_o + bv2_o] / [2 * bv1_o * bv2_o]</tt>
106 */
107 template <typename T1, typename T2>
108 double KulczynskiSimilarity(const T1& bv1, const T2& bv2);
109 
110 //! returns the Dice similarity between two bit vects
111 /*!
112  \return <tt>2*(bv1&bv2)_o / [bv1_o + bv2_o]</tt>
113 */
114 template <typename T1, typename T2>
115 double DiceSimilarity(const T1& bv1, const T2& bv2);
116 
117 //! returns the Tversky similarity between two bit vects
118 /*!
119  \return <tt>(bv1&bv2)_o / [a*bv1_o + b*bv2_o + (1 - a - b)*(bv1&bv2)_o]</tt>
120 
121  Notes:
122  # 0 <= a,b <= 1
123  # Tversky(a=1,b=1) = Tanimoto
124  # Tversky(a=1/2,b=1/2) = Dice
125 
126 */
127 template <typename T1, typename T2>
128 double TverskySimilarity(const T1& bv1, const T2& bv2, double a, double b);
129 
130 //! returns the Sokal similarity between two bit vects
131 /*!
132  \return <tt>(bv1&bv2)_o / [2*bv1_o + 2*bv2_o - 3*(bv1&bv2)_o]</tt>
133 */
134 template <typename T1, typename T2>
135 double SokalSimilarity(const T1& bv1, const T2& bv2);
136 
137 //! returns the McConnaughey similarity between two bit vects
138 /*!
139  \return <tt>[(bv1&bv2)_o * (bv1_o + bv2_o) - (bv1_o * bv2_o)] / (bv1_o *
140  bv2_o)</tt>
141 */
142 template <typename T1, typename T2>
143 double McConnaugheySimilarity(const T1& bv1, const T2& bv2);
144 
145 //! returns the Asymmetric similarity between two bit vects
146 /*!
147  \return <tt>(bv1&bv2)_o / min(bv1_o,bv2_o)</tt>
148 */
149 template <typename T1, typename T2>
150 double AsymmetricSimilarity(const T1& bv1, const T2& bv2);
151 
152 //! returns the Braun-Blanquet similarity between two bit vects
153 /*!
154  \return <tt>(bv1&bv2)_o / max(bv1_o,bv2_o)</tt>
155 */
156 template <typename T1, typename T2>
157 double BraunBlanquetSimilarity(const T1& bv1, const T2& bv2);
158 
159 //! returns the Russel similarity between two bit vects
160 /*!
161  \return <tt>(bv1&bv2)_o / bv1_o</tt>
162 
163  <b>Note:</b> this operation is non-commutative:
164  RusselSimilarity(bv1,bv2) != RusselSimilarity(bv2,bv1)
165 
166 */
167 template <typename T1, typename T2>
168 double RusselSimilarity(const T1& bv1, const T2& bv2);
169 
170 //! returns the Rogot-Goldberg similarity between two bit vects
171 /*!
172  \return <tt>(bv1&bv2)_o / (bv1_o + bv2_o)
173  + (bv1_n - bv1_o - bv2_o + (bv1&bv2)_o) / (2*bv1_n - bv1_o - bv2_o) </tt>
174 */
175 template <typename T1, typename T2>
176 double RogotGoldbergSimilarity(const T1& bv1, const T2& bv2);
177 
178 //! returns the on bit similarity between two bit vects
179 /*!
180  \return <tt>(bv1&bv2)_o / (bv1|bv2)_o </tt>
181 */
182 template <typename T1, typename T2>
183 double OnBitSimilarity(const T1& bv1, const T2& bv2);
184 
185 //! returns the number of common bits (on and off) between two bit vects
186 /*!
187  \return <tt>bv1_n - (bv1^bv2)_o</tt>
188 */
189 template <typename T1, typename T2>
190 int NumBitsInCommon(const T1& bv1, const T2& bv2);
191 
192 int NumBitsInCommon(const ExplicitBitVect& bv1, const ExplicitBitVect& bv2);
193 
194 //! returns the common-bit similarity (on and off) between two bit vects
195 //! This is also called Manhattan similarity.
196 /*!
197  \return <tt>[bv1_n - (bv1^bv2)_o] / bv1_n</tt>
198 */
199 template <typename T1, typename T2>
200 double AllBitSimilarity(const T1& bv1, const T2& bv2);
201 
202 //! returns an IntVect with indices of all on bits in common between two bit
203 //! vects
204 template <typename T1, typename T2>
205 IntVect OnBitsInCommon(const T1& bv1, const T2& bv2);
206 
207 //! returns an IntVect with indices of all off bits in common between two bit
208 //! vects
209 template <typename T1, typename T2>
210 IntVect OffBitsInCommon(const T1& bv1, const T2& bv2);
211 
212 //! returns the on-bit projected similarities between two bit vects
213 /*!
214  \return two values, as a DoubleVect:
215  - <tt>(bv1&bv2)_o / bv1_o</tt>
216  - <tt>(bv1&bv2)_o / bv2_o</tt>
217 */
218 template <typename T1, typename T2>
219 DoubleVect OnBitProjSimilarity(const T1& bv1, const T2& bv2);
220 
221 //! returns the off-bit projected similarities between two bit vects
222 /*!
223  \return two values, as a DoubleVect:
224  - <tt>[bv1_n - (bv1|bv2)_o] / [bv1_n - bv1_o]</tt>
225  - <tt>[bv2_n - (bv1|bv2)_o] / [bv2_n - bv2_o]</tt>
226 
227  <b>Note:</b> <tt>bv1_n = bv2_n</tt>
228 
229 */
230 template <typename T1, typename T2>
231 DoubleVect OffBitProjSimilarity(const T1& bv1, const T2& bv2);
232 
233 //! folds a bit vector \c factor times and returns the result
234 /*!
235  \param bv1 the vector to be folded
236  \param factor (optional) the number of times to fold it
237 
238  \return a pointer to the folded fingerprint, which is
239  <tt>bv1_n/factor</tt> long.
240 
241  <b>Note:</b> The caller is responsible for <tt>delete</tt>ing the result.
242  */
243 template <typename T1>
244 T1* FoldFingerprint(const T1& bv1, unsigned int factor = 2);
245 
246 //! returns a text representation of a bit vector (a string of 0s and 1s)
247 /*!
248  \param bv1 the vector to use
249 
250  \return an std::string
251 
252  */
253 template <typename T1>
254 std::string BitVectToText(const T1& bv1);
255 
256 //! returns a hex representation of a bit vector compatible with Andrew Dalke's
257 //! FPS format
258 /*!
259  \param bv1 the vector to use
260 
261  \return an std::string
262 
263  */
264 template <typename T1>
265 std::string BitVectToFPSText(const T1& bv1);
266 
267 //! returns a binary string representation of a bit vector (an array of bytes)
268 /*!
269  \param bv1 the vector to use
270 
271  \return an std::string
272 
273  */
274 template <typename T1>
275 std::string BitVectToBinaryText(const T1& bv1);
276 
277 //! updates a bit vector from Andrew Dalke's FPS format
278 /*!
279  \param bv1 the vector to use
280  \param fps the FPS hex string
281 
282 
283  */
284 template <typename T1>
285 void UpdateBitVectFromFPSText(T1& bv1, const std::string& fps);
286 
287 //! updates a bit vector from a binary string representation of a bit vector (an
288 //! array of bytes)
289 /*!
290  \param bv1 the vector to use
291  \param fps the binary string
292 
293 
294  */
295 template <typename T1>
296 void UpdateBitVectFromBinaryText(T1& bv1, const std::string& fps);
297 
298 // FIX: docs and tests please
299 
300 unsigned int CalcBitmapPopcount(const unsigned char* bv1, unsigned int nBytes);
301 
302 double CalcBitmapTanimoto(const unsigned char* bv1, const unsigned char* bv2,
303  unsigned int nBytes);
304 double CalcBitmapDice(const unsigned char* bv1, const unsigned char* bv2,
305  unsigned int nBytes);
306 double CalcBitmapTversky(const unsigned char* bv1, const unsigned char* bv2,
307  unsigned int nBytes, double ca, double cb);
308 bool CalcBitmapAllProbeBitsMatch(const unsigned char* probe,
309  const unsigned char* ref, unsigned int nBytes);
310 #endif
Pulls in all the BitVect classes.
double DiceSimilarity(const T1 &bv1, const T2 &bv2)
returns the Dice similarity between two bit vects
T1 * FoldFingerprint(const T1 &bv1, unsigned int factor=2)
folds a bit vector factor times and returns the result
std::string BitVectToFPSText(const T1 &bv1)
returns a hex representation of a bit vector compatible with Andrew Dalke's FPS format
double SokalSimilarity(const T1 &bv1, const T2 &bv2)
returns the Sokal similarity between two bit vects
DoubleVect OffBitProjSimilarity(const T1 &bv1, const T2 &bv2)
returns the off-bit projected similarities between two bit vects
double TanimotoSimilarity(const T1 &bv1, const T2 &bv2)
returns the Tanimoto similarity between two bit vects
double OnBitSimilarity(const T1 &bv1, const T2 &bv2)
returns the on bit similarity between two bit vects
DoubleVect OnBitProjSimilarity(const T1 &bv1, const T2 &bv2)
returns the on-bit projected similarities between two bit vects
double CalcBitmapTversky(const unsigned char *bv1, const unsigned char *bv2, unsigned int nBytes, double ca, double cb)
int NumOnBitsInCommon(const T1 &bv1, const T2 &bv2)
returns the number of on bits in common between two bit vectors
bool CalcBitmapAllProbeBitsMatch(const unsigned char *probe, const unsigned char *ref, unsigned int nBytes)
int NumBitsInCommon(const T1 &bv1, const T2 &bv2)
returns the number of common bits (on and off) between two bit vects
double RogotGoldbergSimilarity(const T1 &bv1, const T2 &bv2)
returns the Rogot-Goldberg similarity between two bit vects
void UpdateBitVectFromBinaryText(T1 &bv1, const std::string &fps)
updates a bit vector from a binary string representation of a bit vector (an array of bytes)
double AsymmetricSimilarity(const T1 &bv1, const T2 &bv2)
returns the Asymmetric similarity between two bit vects
std::string BitVectToBinaryText(const T1 &bv1)
returns a binary string representation of a bit vector (an array of bytes)
double CosineSimilarity(const T1 &bv1, const T2 &bv2)
returns the Cosine similarity between two bit vects
std::vector< double > DoubleVect
Definition: BitVect.h:18
double CalcBitmapDice(const unsigned char *bv1, const unsigned char *bv2, unsigned int nBytes)
double SimilarityWrapper(const T &bv1, const T &bv2, double(*metric)(const T &, const T &), bool returnDistance=false)
Definition: BitOps.h:30
IntVect OnBitsInCommon(const T1 &bv1, const T2 &bv2)
returns an IntVect with indices of all on bits in common between two bit vects
std::string BitVectToText(const T1 &bv1)
returns a text representation of a bit vector (a string of 0s and 1s)
double TverskySimilarity(const T1 &bv1, const T2 &bv2, double a, double b)
returns the Tversky similarity between two bit vects
double McConnaugheySimilarity(const T1 &bv1, const T2 &bv2)
returns the McConnaughey similarity between two bit vects
double AllBitSimilarity(const T1 &bv1, const T2 &bv2)
bool AllProbeBitsMatch(const char *probe, const char *ref)
std::vector< int > IntVect
Definition: BitVect.h:16
a class for bit vectors that are densely occupied
double CalcBitmapTanimoto(const unsigned char *bv1, const unsigned char *bv2, unsigned int nBytes)
IntVect OffBitsInCommon(const T1 &bv1, const T2 &bv2)
returns an IntVect with indices of all off bits in common between two bit vects
double RusselSimilarity(const T1 &bv1, const T2 &bv2)
returns the Russel similarity between two bit vects
double KulczynskiSimilarity(const T1 &bv1, const T2 &bv2)
returns the Kulczynski similarity between two bit vects
void UpdateBitVectFromFPSText(T1 &bv1, const std::string &fps)
updates a bit vector from Andrew Dalke's FPS format
double BraunBlanquetSimilarity(const T1 &bv1, const T2 &bv2)
returns the Braun-Blanquet similarity between two bit vects
unsigned int CalcBitmapPopcount(const unsigned char *bv1, unsigned int nBytes)