SparseIntVect.h
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007 #ifndef __RD_SPARSE_INT_VECT_20070921__
00008 #define __RD_SPARSE_INT_VECT_20070921__
00009
00010 #include <map>
00011 #include <string>
00012 #include <RDGeneral/Invariant.h>
00013 #include <sstream>
00014 #include <RDBoost/Exceptions.h>
00015 #include <RDGeneral/StreamOps.h>
00016 #include <boost/cstdint.hpp>
00017
00018
00019 const int ci_SPARSEINTVECT_VERSION=0x0001;
00020 namespace RDKit{
00021
00022 template <typename IndexType>
00023 class SparseIntVect {
00024 public:
00025 typedef std::map<IndexType,int> StorageType;
00026
00027 SparseIntVect() : d_length(0) {};
00028
00029
00030 SparseIntVect(IndexType length) : d_length(length) {};
00031
00032
00033 SparseIntVect(const SparseIntVect<IndexType> &other){
00034 d_length=other.d_length;
00035 d_data.insert(other.d_data.begin(),other.d_data.end());
00036 }
00037
00038
00039 SparseIntVect(const std::string pkl){
00040 initFromText(pkl.c_str(),pkl.size());
00041 };
00042
00043 SparseIntVect(const char *pkl,const unsigned int len){
00044 initFromText(pkl,len);
00045 };
00046
00047
00048 ~SparseIntVect() {}
00049
00050
00051 int getVal(IndexType idx) const {
00052 if(idx<0||idx>=d_length){
00053 throw IndexErrorException(static_cast<int>(idx));
00054 }
00055 int res=0;
00056 typename StorageType::const_iterator iter=d_data.find(idx);
00057 if(iter!=d_data.end()){
00058 res=iter->second;
00059 }
00060 return res;
00061 };
00062 int operator[] (IndexType idx) const { return getVal(idx); };
00063
00064
00065 void setVal(IndexType idx, int val){
00066 if(idx<0||idx>=d_length){
00067 throw IndexErrorException(static_cast<int>(idx));
00068 }
00069 if(val!=0){
00070 d_data[idx]=val;
00071 } else {
00072 d_data.erase(idx);
00073 }
00074 };
00075
00076 IndexType getLength() const { return d_length; };
00077
00078
00079 int getTotalVal() const {
00080 int res=0;
00081 typename StorageType::const_iterator iter;
00082 for(iter=d_data.begin();iter!=d_data.end();++iter){
00083 res+=iter->second;
00084 }
00085 return res;
00086 };
00087
00088
00089
00090 const StorageType &getNonzeroElements() const {
00091 return d_data;
00092 }
00093
00094
00095
00096
00097
00098 SparseIntVect<IndexType> &
00099 operator&= (const SparseIntVect<IndexType> &other) {
00100 if(other.d_length!=d_length){
00101 throw ValueErrorException("SparseIntVect size mismatch");
00102 }
00103
00104 typename StorageType::iterator iter=d_data.begin();
00105 typename StorageType::const_iterator oIter=other.d_data.begin();
00106 while(iter!=d_data.end()){
00107
00108 while(oIter!=other.d_data.end() && oIter->first < iter->first){
00109 ++oIter;
00110 }
00111 if(oIter!=other.d_data.end() && oIter->first==iter->first){
00112
00113 if(oIter->second<iter->second){
00114 iter->second=oIter->second;
00115 }
00116 ++oIter;
00117 ++iter;
00118 } else {
00119
00120
00121 typename StorageType::iterator tmpIter=iter;
00122 ++tmpIter;
00123 d_data.erase(iter);
00124 iter=tmpIter;
00125 }
00126 }
00127 return *this;
00128 };
00129 const SparseIntVect<IndexType>
00130 operator& (const SparseIntVect<IndexType> &other) const {
00131 SparseIntVect<IndexType> res(*this);
00132 return res&=other;
00133 }
00134
00135
00136
00137
00138 SparseIntVect<IndexType> &
00139 operator|= (const SparseIntVect<IndexType> &other) {
00140 if(other.d_length!=d_length){
00141 throw ValueErrorException("SparseIntVect size mismatch");
00142 }
00143
00144 typename StorageType::iterator iter=d_data.begin();
00145 typename StorageType::const_iterator oIter=other.d_data.begin();
00146 while(iter!=d_data.end()){
00147
00148 while(oIter!=other.d_data.end() &&
00149 oIter->first < iter->first){
00150 d_data[oIter->first]=oIter->second;
00151 ++oIter;
00152 }
00153 if(oIter!=other.d_data.end() && oIter->first==iter->first){
00154
00155 if(oIter->second>iter->second){
00156 iter->second=oIter->second;
00157 }
00158 ++oIter;
00159 }
00160 ++iter;
00161 }
00162
00163 while(oIter!=other.d_data.end()){
00164 d_data[oIter->first]=oIter->second;
00165 ++oIter;
00166 }
00167 return *this;
00168 };
00169 const SparseIntVect<IndexType>
00170 operator| (const SparseIntVect<IndexType> &other) const {
00171 SparseIntVect<IndexType> res(*this);
00172 return res|=other;
00173 }
00174
00175 SparseIntVect<IndexType> &
00176 operator+= (const SparseIntVect<IndexType> &other) {
00177 if(other.d_length!=d_length){
00178 throw ValueErrorException("SparseIntVect size mismatch");
00179 }
00180 typename StorageType::iterator iter=d_data.begin();
00181 typename StorageType::const_iterator oIter=other.d_data.begin();
00182 while(oIter!=other.d_data.end()){
00183 while(iter!=d_data.end() &&
00184 iter->first < oIter->first){
00185 ++iter;
00186 }
00187 if(iter!=d_data.end() && oIter->first==iter->first){
00188
00189 iter->second+=oIter->second;
00190 if(!iter->second){
00191 typename StorageType::iterator tIter=iter;
00192 ++tIter;
00193 d_data.erase(iter);
00194 iter=tIter;
00195 } else {
00196 ++iter;
00197 }
00198 } else {
00199 d_data[oIter->first]=oIter->second;
00200 }
00201 ++oIter;
00202 }
00203 return *this;
00204 };
00205 const SparseIntVect<IndexType>
00206 operator+ (const SparseIntVect<IndexType> &other) const {
00207 SparseIntVect<IndexType> res(*this);
00208 return res+=other;
00209 }
00210
00211 SparseIntVect<IndexType> &
00212 operator-= (const SparseIntVect<IndexType> &other) {
00213 if(other.d_length!=d_length){
00214 throw ValueErrorException("SparseIntVect size mismatch");
00215 }
00216 typename StorageType::iterator iter=d_data.begin();
00217 typename StorageType::const_iterator oIter=other.d_data.begin();
00218 while(oIter!=other.d_data.end()){
00219 while(iter!=d_data.end() &&
00220 iter->first < oIter->first){
00221 ++iter;
00222 }
00223 if(iter!=d_data.end() && oIter->first==iter->first){
00224
00225 iter->second-=oIter->second;
00226 if(!iter->second){
00227 typename StorageType::iterator tIter=iter;
00228 ++tIter;
00229 d_data.erase(iter);
00230 iter=tIter;
00231 } else {
00232 ++iter;
00233 }
00234 } else {
00235 d_data[oIter->first]=-oIter->second;
00236 }
00237 ++oIter;
00238 }
00239 return *this;
00240 };
00241 const SparseIntVect<IndexType>
00242 operator- (const SparseIntVect<IndexType> &other) const {
00243 SparseIntVect<IndexType> res(*this);
00244 return res-=other;
00245 }
00246
00247 bool operator==(const SparseIntVect<IndexType> &v2) const{
00248 if(d_length!=v2.d_length){
00249 return false;
00250 }
00251 return d_data==v2.d_data;
00252 }
00253 bool operator!=(const SparseIntVect<IndexType> &v2) const {
00254 return !(*this==v2);
00255 }
00256
00257
00258 std::string toString() const {
00259 std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00260 boost::uint32_t tInt;
00261 tInt=ci_SPARSEINTVECT_VERSION;
00262 streamWrite(ss,tInt);
00263 tInt=sizeof(IndexType);
00264 streamWrite(ss,tInt);
00265 streamWrite(ss,d_length);
00266 IndexType nEntries=d_data.size();
00267 streamWrite(ss,nEntries);
00268
00269 typename StorageType::const_iterator iter=d_data.begin();
00270 while(iter!=d_data.end()){
00271 streamWrite(ss,iter->first);
00272 boost::int32_t tInt=iter->second;
00273 streamWrite(ss,tInt);
00274 ++iter;
00275 }
00276 return ss.str();
00277 };
00278
00279 void fromString(const std::string &txt) {
00280 initFromText(txt.c_str(),txt.length());
00281 }
00282
00283 private:
00284 IndexType d_length;
00285 StorageType d_data;
00286
00287 void initFromText(const char *pkl,const unsigned int len) {
00288 d_data.clear();
00289 std::stringstream ss(std::ios_base::binary|std::ios_base::out|std::ios_base::in);
00290 ss.write(pkl,len);
00291
00292 boost::uint32_t vers;
00293 streamRead(ss,vers);
00294 if(vers==0x0001){
00295 boost::uint32_t tInt;
00296 streamRead(ss,tInt);
00297 if(tInt>sizeof(IndexType)){
00298 throw ValueErrorException("IndexType cannot accomodate index size in SparseIntVect pickle");
00299 }
00300 switch(tInt){
00301 case sizeof(char):
00302 readVals<unsigned char>(ss);break;
00303 case sizeof(boost::int32_t):
00304 readVals<boost::uint32_t>(ss);break;
00305 case sizeof(boost::int64_t):
00306 readVals<boost::uint64_t>(ss);break;
00307 default:
00308 throw ValueErrorException("unreadable format");
00309 }
00310 } else {
00311 throw ValueErrorException("bad version in SparseIntVect pickle");
00312 }
00313 };
00314 template <typename T>
00315 void readVals(std::stringstream &ss){
00316 PRECONDITION(sizeof(T)<=sizeof(IndexType),"invalid size");
00317 T tVal;
00318 streamRead(ss,tVal);
00319 d_length=tVal;
00320 T nEntries;
00321 streamRead(ss,nEntries);
00322 for(T i=0;i<nEntries;++i){
00323 streamRead(ss,tVal);
00324 boost::int32_t val;
00325 streamRead(ss,val);
00326 d_data[tVal]=val;
00327 }
00328 }
00329 };
00330
00331 template <typename IndexType, typename SequenceType>
00332 void updateFromSequence(SparseIntVect<IndexType> &vect,
00333 const SequenceType &seq){
00334 typename SequenceType::const_iterator seqIt;
00335 for(seqIt=seq.begin();seqIt!=seq.end();++seqIt){
00336
00337 IndexType idx=*seqIt;
00338 vect.setVal(idx,vect.getVal(idx)+1);
00339 }
00340 }
00341
00342 template <typename IndexType>
00343 double DiceSimilarity(const SparseIntVect<IndexType> &v1,
00344 const SparseIntVect<IndexType> &v2,
00345 bool returnDistance=false,
00346 double bounds=0.0){
00347 if(v1.getLength()!=v2.getLength()){
00348 throw ValueErrorException("SparseIntVect size mismatch");
00349 }
00350 double v1Sum=0.0;
00351 double v2Sum=0.0;
00352 if(!returnDistance && bounds>0.0){
00353 v1Sum=v1.getTotalVal();
00354 v2Sum=v2.getTotalVal();
00355 double denom=v1Sum+v2Sum;
00356 if(fabs(denom)<1e-6){
00357 if(returnDistance){
00358 return 1.0;
00359 } else {
00360 return 0.0;
00361 }
00362 }
00363 double minV=v1Sum<v2Sum?v1Sum:v2Sum;
00364 if(2.*minV/denom<bounds){
00365 return 0.0;
00366 }
00367 v1Sum=0.0;
00368 v2Sum=0.0;
00369 }
00370
00371 double numer=0.0;
00372
00373
00374 typename SparseIntVect<IndexType>::StorageType::const_iterator iter1,iter2;
00375 iter1=v1.getNonzeroElements().begin();
00376 if(iter1!=v1.getNonzeroElements().end()) v1Sum+=iter1->second;
00377 iter2=v2.getNonzeroElements().begin();
00378 if(iter2!=v2.getNonzeroElements().end()) v2Sum+=iter2->second;
00379 while(iter1 != v1.getNonzeroElements().end()){
00380 while(iter2!=v2.getNonzeroElements().end() && iter2->first < iter1->first){
00381 ++iter2;
00382 if(iter2!=v2.getNonzeroElements().end()) v2Sum+=iter2->second;
00383 }
00384 if(iter2!=v2.getNonzeroElements().end()){
00385 if(iter2->first == iter1->first){
00386 if(iter2->second<iter1->second){
00387 numer += iter2->second;
00388 } else {
00389 numer += iter1->second;
00390 }
00391 ++iter2;
00392 if(iter2!=v2.getNonzeroElements().end()) v2Sum+=iter2->second;
00393 }
00394 ++iter1;
00395 if(iter1!=v1.getNonzeroElements().end()) v1Sum+=iter1->second;
00396 } else {
00397 break;
00398 }
00399 }
00400 if(iter1 != v1.getNonzeroElements().end()){
00401 ++iter1;
00402 while(iter1!=v1.getNonzeroElements().end()){
00403 v1Sum+=iter1->second;
00404 ++iter1;
00405 }
00406 }
00407
00408 if(iter2!=v2.getNonzeroElements().end()){
00409 ++iter2;
00410 while(iter2!=v2.getNonzeroElements().end()){
00411 v2Sum+=iter2->second;
00412 ++iter2;
00413 }
00414 }
00415 double denom=v1Sum+v2Sum;
00416 double sim;
00417 if(fabs(denom)<1e-6){
00418 sim=0.0;
00419 } else {
00420 sim=2.*numer/denom;
00421 }
00422 if(returnDistance) sim = 1.-sim;
00423
00424 return sim;
00425 }
00426 }
00427
00428
00429
00430 #endif